source: trunk/gsdl/packages/mg-1.3d/src/text/MGPASS.C@ 28

Last change on this file since 28 was 28, checked in by rjmcnab, 25 years ago

Fixed a small error in the windows compilation

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1/**************************************************************************
2 *
3 * mgpass.cpp -- Driver for the various passes -
4
5 V1 - removed all the pipe processing and replaced with
6 code to directly explore a directory of files.
7 V2 - rebuilt to extract non text files from web browser
8 'catch' file. Also to display progress count
9 GH/WJR
10
11 * Copyright (C) 1994 Neil Sharman, ..
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * $Id: MGPASS.C 28 1998-11-24 03:39:23Z rjmcnab $
28 *
29 **************************************************************************/
30
31#include "sysfuncs.h"
32
33#include <malloc.h>
34#include <stdlib.h>
35#include <stdio.h>
36#include <io.h>
37#include <fcntl.h>
38#include <string.h>
39
40#include "memlib.h"
41#include "messages.h"
42
43#include "mg_files.h"
44#include "mg.h"
45#include "build.h"
46#include "text.h"
47#include "stemmer.h"
48
49/*
50 $Log$
51 Revision 1.3 1998/11/24 03:39:23 rjmcnab
52
53 Fixed a small error in the windows compilation
54
55 Revision 1.2 1998/11/24 01:29:39 rjmcnab
56
57 Fixed a few problems with the windows build
58
59 Revision 1.1 1998/11/17 09:34:12 rjmcnab
60 *** empty log message ***
61
62 * Revision 1.3 1994/10/20 03:56:57 tes
63 * I have rewritten the boolean query optimiser and abstracted out the
64 * components of the boolean query.
65 *
66 * Revision 1.2 1994/09/20 04:41:52 tes
67 * For version 1.1
68 *
69 */
70
/* RCS identification string for this source file. */
static char *RCSID = "$Id: MGPASS.C 28 1998-11-24 03:39:23Z rjmcnab $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Bit flags accumulated in `Passes'.  Bit N being set selects
 * PassData[N], so the order here must match the order of that table.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Smallest permitted document buffer, and the minimum free space
   refill() insists on before reading more data (bytes). */
#define MIN_BUF 8192

/* Capacity, in bytes, of the fixed-size path and name buffers. */
#define path_length 256
83
84unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
85unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
86unsigned long ChunkLimit = 0;
87char InvfLevel = 2;
88char SkipSGML = 0;
89char MakeWeights = 0;
90FILE *Comp_Stats = NULL;
91int comp_stat_point = 0;
92double bytes_processed = 0;
93int num_docs = 0;
94double bytes_received = 0;
95int stem_method = 0;
96
97static char Passes = 0;
98static unsigned long trace = 0;
99static int Dump = 0;
100static char **files = NULL;
101static int num_files = 0;
102static char *trace_name = NULL;
103
104static char dirname[path_length], wildname[path_length];
105static int by_para = 0, recurse = 0, html_catch = 0;
106static char *buffer;
107char *line_start, *data_end, *base, *scan;
108
109typedef struct pass_data
110 {
111 char *name;
112 int (*init) (char *);
113 int (*process) (u_char *, int);
114 int (*done) (char *);
115 }
116pass_data;
117
118static pass_data PassData[MAX_PASSES] =
119{
120 {"special", init_special, process_special, done_special},
121 {"text.pass1", init_text_1, process_text_1, done_text_1},
122 {"text.pass2", init_text_2, process_text_2, done_text_2},
123 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
124 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
125};
126
/*
 * printf-style template for the usage message.
 *
 * The template consumes, in order: the program name (%s) followed by
 * four (int width, string) pairs for the four "%*s" conversions, which
 * are used to indent the continuation lines under the program name.
 *
 * The option list includes -H (input is a browser catch file) and
 * -p (split files into paragraphs), both of which main() accepts.
 */
static char *usage_str = "\nUSAGE:\n"
" %s [-h] [-G] [-D] [-H] [-p] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2]\n"
" %*s [-N1] [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n"
" %*s [source directory\\] [source file]\n";
133
134static void usage (char *err)
135{
136 if (err) Message (err);
137 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
138 strlen (msg_prefix), "", strlen (msg_prefix), "");
139 exit (1);
140}
141
142void do_process(char *buffer, int num)
143{
144 int pass;
145 if (num == 0) Message ("Warning : Processing zero length document");
146 num_docs++;
147 bytes_processed += num;
148 for (pass = 0; pass < MAX_PASSES; pass++)
149 if (Passes & (1 << pass))
150 if (PassData[pass].process((u_char *)buffer, num) == COMPERROR)
151 FatalError(1, "Error during file processing");
152}
153
154int refill(int in_fd)
155{
156 int num, bitleft;
157 bitleft = data_end - base;
158 memmove(buffer, base, bitleft);
159 if (buf_size - bitleft < MIN_BUF)
160 FatalError(1, "Paragraph too big for buffer");
161 num = read(in_fd, &buffer[bitleft], buf_size - bitleft);
162 line_start -= (base - buffer);
163 scan -= (base - buffer);
164 base = buffer;
165 data_end = buffer + bitleft + num;
166 if (num > 0) return 1; else return 0;
167}
168
169char *scanpara(int in_fd)
170{
171 int num, blank, in_blank, at_end;
172 num = read(in_fd,buffer,buf_size);
173 in_blank = 1; base = buffer; line_start = buffer; scan = buffer;
174 data_end = buffer + num; at_end = 0;
175 for (;;) {
176 blank = 1; line_start = scan; // Get a line
177 while (scan < data_end && *scan != '\n')
178 if (*scan++ > ' ') blank = 0;
179 if (scan >= data_end) {
180 if (refill(in_fd)) {
181 while (scan < data_end && *scan != '\n')
182 if (*scan++ > ' ') blank = 0;
183 if (scan < data_end) scan++;
184 }
185 else at_end = 1;
186 }
187 else scan++;
188 if (line_start < scan) { // If we have a line
189 if (in_blank) {
190 if (!blank) { in_blank = 0; base = line_start; }
191 }
192 else {
193 if (blank) {
194 do_process(base, line_start - base);
195 in_blank = 1; base = line_start;
196 }
197 }
198 }
199 if (at_end) break;
200 }
201 if (in_blank) base = scan;
202 if (scan + 2 <= buffer + buf_size) {
203 *scan++ = 26; *scan++ = 10;
204 }
205 if (base < scan) do_process(base, scan - base);
206 return NULL;
207}
208
209char *scanfile(int in_fd)
210{
211 int num = read(in_fd,buffer,buf_size); /*expect to read the whole file*/
212 if (num < 0) return "file locked";
213 if (num >= buf_size-1) return "file too long";
214 do_process(buffer, num);
215 return NULL;
216}
217
218void search(char *dname, char *fname)
219{
220 long dirtag; struct _finddata_t dirinfo; int in_fd; char *res;
221 char search_name[path_length], found_name[path_length];
222
223 sprintf(search_name, "%s%s", dname, fname); /*Scan files*/
224 dirtag = _findfirst(search_name, &dirinfo);
225 if (dirtag >= 0) {
226 do {
227 if ((dirinfo.attrib & (_A_SUBDIR | _A_HIDDEN | _A_SYSTEM)) == 0) {
228 sprintf(found_name,"%s%s",dname,dirinfo.name);
229 in_fd = open(found_name,O_RDONLY|O_BINARY);
230 if (in_fd >= 0) {
231 if (by_para)
232 res = scanpara(in_fd);
233 else
234 res = scanfile(in_fd);
235 if (res != NULL) {
236 Message("Error %s in processing file %s\n", res, found_name);
237 exit(1);
238 }
239 close(in_fd);
240 }
241 }
242 } while (_findnext(dirtag, &dirinfo) == 0);
243 _findclose(dirtag);
244 }
245
246 if (recurse == 0) return;
247
248 sprintf(search_name, "%s*.*", dname); /*Look for subdirs*/
249 dirtag = _findfirst(search_name, &dirinfo);
250 if (dirtag < 0) return;
251 do {
252 if ( ((dirinfo.attrib & (_A_HIDDEN | _A_SYSTEM)) == 0) &&
253 ((dirinfo.attrib & _A_SUBDIR) != 0) &&
254 strcmp(dirinfo.name,".") != 0 &&
255 strcmp(dirinfo.name,"..") != 0) {
256 sprintf(found_name,"%s%s",dname,dirinfo.name);
257 strcat(found_name,"\\");
258 search(found_name,fname);
259 }
260 } while (_findnext(dirtag, &dirinfo) == 0);
261 _findclose(dirtag);
262}
263
264static int toobig(int n)
265{
266 if (n > path_length) {
267 printf("Cannot handle urls > %d characters in length\n", path_length);
268 exit(1);
269 }
270 return 0;
271}
272
273void scan_catch(char *dname, char *fname)
274{
275 int in_fd, urllen, conlen, filesize; char catch_name[path_length];
276 int filecount = 0;
277 enum { filekind_redirected, filekind_text, filekind_other } filekind;
278 sprintf(catch_name, "%s%s", dname, fname);
279 in_fd = open(catch_name,O_RDONLY|O_BINARY);
280 if (in_fd < 0)
281 FatalError(1, "Couldn't open catch file \"%s\"", catch_name);
282 for (;;) {
283 filecount++; if (filecount%100 == 0) { printf("%d\r", filecount); fflush(stdout); }
284 if (read(in_fd, &urllen, sizeof(int)) != sizeof(int) || toobig(urllen) ||
285 read(in_fd, catch_name, urllen) != urllen ||
286 read(in_fd, &conlen, sizeof(int)) != sizeof(int) || toobig(conlen) ||
287 read(in_fd, catch_name, conlen) != conlen) break;
288 if (conlen >= 1 && *catch_name == '@')
289 filekind = filekind_redirected;
290 else {
291 if (conlen >= 4 && strncmp(catch_name, "text", 4) == 0)
292 filekind = filekind_text;
293 else
294 filekind = filekind_other;
295 if (read(in_fd, &filesize, sizeof(int)) != sizeof(int))
296 FatalError(1, "File read failed for size field");
297 if (filesize > buf_size)
298 FatalError(1, "File too large (%d > %d)", filesize, buf_size);
299 if (read(in_fd, buffer, filesize) != filesize)
300 FatalError(1, "Failed to read file data");
301 }
302 if (filekind == filekind_text) do_process(buffer, filesize);
303 }
304 close(in_fd);
305}
306
307static void driver (FILE * Trace, char *file_name)
308{
309 int pass;
310
311 buffer = (char *)Xmalloc (buf_size);
312
313 for (pass = 0; pass < MAX_PASSES; pass++)
314 if (Passes & (1 << pass))
315 {
316 if (PassData[pass].init (file_name) == COMPERROR)
317 FatalError (1, "Error during init of \"%s\"",PassData[pass].name);
318 }
319
320 if (html_catch == 0) search(dirname, wildname);
321 else scan_catch(dirname, wildname);
322
323 for (pass = 0; pass < MAX_PASSES; pass++)
324 if (Passes & (1 << pass))
325 {
326 if (PassData[pass].done (file_name) == COMPERROR)
327 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
328 }
329
330
331 free (buffer);
332}
333
334void main (int argc, char **argv)
335{
336 int ch;
337 char *filename = NULL;
338 FILE *Trace = NULL;
339
340 msg_prefix = argv[0];
341
342 opterr = 0;
343 while ((ch = getopt (argc, argv, "hC:WHGpSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
344 {
345 switch (ch)
346 {
347 case 'H':
348 html_catch = 1;
349 break;
350 case 'G':
351 SkipSGML = 1;
352 break;
353 case 'p':
354 by_para = 1;
355 break;
356 case 'S':
357 Passes |= SPECIAL;
358 break;
359 case '1':
360 InvfLevel = 1;
361 break;
362 case '2':
363 InvfLevel = 2;
364 break;
365 case '3':
366 InvfLevel = 3;
367 break;
368 case 'f':
369 filename = optarg;
370 break;
371 case 'n':
372 trace_name = optarg;
373 break;
374 case 'D':
375 Dump = 1;
376 break;
377 case 'W':
378 MakeWeights = 1;
379 break;
380 case 'd':
381 set_basepath (optarg);
382 break;
383 case 's':
384 stem_method = atoi (optarg) & STEMMER_MASK;
385 break;
386 case 'b':
387 buf_size = atoi (optarg) * 1024;
388 break;
389 case 'C':
390 comp_stat_point = atoi (optarg) * 1024;
391 break;
392 case 'c':
393 ChunkLimit = atoi (optarg);
394 break;
395 case 'm':
396 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
397 break;
398 case 'I':
399 case 'N': /* N kept for compatability */
400 if (*optarg == '1')
401 Passes |= IVF_PASS_1;
402 else if (*optarg == '2')
403 Passes |= IVF_PASS_2;
404 else
405 usage ("Invalid pass number");
406 break;
407 case 'T':
408 if (*optarg == '1')
409 Passes |= TEXT_PASS_1;
410 else if (*optarg == '2')
411 Passes |= TEXT_PASS_2;
412 else
413 usage ("Invalid pass number");
414 break;
415 case 't':
416 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
417 break;
418 case 'h':
419 case '?':
420 usage (NULL);
421 }
422 }
423
424 if (!filename || *filename == '\0')
425 FatalError (1, "A document collection name must be specified.");
426
427 if (buf_size < MIN_BUF)
428 FatalError (1, "The buffer size must exceed 1024 bytes.");
429
430 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
431 FatalError (1, "I1 and I2 cannot be done simultaneously.");
432
433 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
434 FatalError (1, "T1 and T2 cannot be done simultaneously.");
435
436 if (!Passes)
437 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
438
439 if (argc - optind == 1) {
440 strcpy(dirname,"");
441 strcpy(wildname,argv[optind]);
442 }
443 else if (argc - optind == 2) {
444 strcpy(dirname,argv[optind]);
445 strcpy(wildname,argv[optind+1]);
446 }
447 else FatalError(1, "Finder code requires directory and filespec.");
448
449 if (strrchr(wildname,'*') != NULL || strrchr(wildname,'?') != NULL)
450 recurse = 1;
451
452 if (trace)
453 {
454 if (!trace_name)
455 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
456 if (!(Trace = fopen (trace_name, "a")))
457 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
458 else
459 setbuf (Trace, NULL);
460 }
461 else
462 Trace = NULL;
463
464 if (comp_stat_point)
465 {
466 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
467 if (!(Comp_Stats = fopen (name, "wb")))
468 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
469 name);
470 }
471
472
473 if (Trace)
474 {
475 int i;
476 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
477 for (i = 0; i < argc; i++)
478 fprintf (Trace, "%s ", argv[i]);
479 fprintf (Trace, "\n\n");
480 }
481
482 driver (Trace, filename);
483
484 if (Trace)
485 fclose (Trace);
486
487 if (Comp_Stats)
488 fclose (Comp_Stats);
489
490 exit (0);
491}
492
Note: See TracBrowser for help on using the repository browser.