source: trunk/gsdl/src/mgpp/text/mg_passes.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25
26// need this to avoid bizarre compiler problems under VC++ 6.0
27#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
28# include <iostream>
29#endif
30
31#include "sysfuncs.h"
32
33#ifdef HAVE_MALLINFO
34# include <malloc.h>
35#endif
36
37#if defined __WIN32__
38# include <io.h>
39# include "getopt.h"
40# define close _close
41# define open _open
42#else
43# include <unistd.h>
44#endif
45
46#include "memlib.h"
47#include "messages.h"
48#include "longlong.h"
49#include "mg_files.h"
50#include "mg.h"
51#include "build.h"
52#include "text.h"
53#include "stemmer.h"
54#include "FileBuf.h"
55#include "TextEl.h"
56#include "TagInfo.h"
57
/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Bit flags accumulated in Passes.  Driver() tests bit N with (1 << N)
 * to decide whether PassData[N] runs, so each flag's bit position must
 * equal the index of the matching PassData entry.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* NOTE(review): not referenced anywhere in this file -- confirm it is still needed. */
#define MIN_BUF 8192
67
68
/*
 * Size in bytes selected with the -m option (given in megabytes on the
 * command line); defaults to 5 MB.  It has external linkage, so it is
 * presumably read by the ivf passes -- confirm against their sources.
 */
unsigned long invf_buffer_size = 5UL * 1024 * 1024; /* 5Mb */

/* Set of pass flag bits requested on the command line (unsigned: it is a bit set). */
static unsigned char Passes = 0;

/* Input files not yet opened: a window into argv, and how many entries remain. */
static char **files = NULL;
static int num_files = 0;
74
75
76struct pass_data {
77 char *name;
78 int (*init) (const TagInfo &tagInfo, char *);
79 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
80 int (*done) (const TagInfo &tagInfo, char *);
81};
82
83
84static pass_data PassData[MAX_PASSES] = {
85 {"special", init_special, process_special, done_special},
86 {"text.pass1", init_text_1, process_text_1, done_text_1},
87 {"text.pass2", init_text_2, process_text_2, done_text_2},
88 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
89 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
90};
91
// printf-style template printed by Usage(): one "%s" for the program name,
// then two "%*s" (width, string) pairs that indent the continuation lines.
// Declared const because it points at a string literal.
static const char *usage_str =
  "\nUSAGE:\n"
  " %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
  " %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
  " %*s [-C] [-h] [-d directory] -f name\n\n";
96
97
98
99static void Usage (char *err) {
100 if (err) Message (err);
101 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
102 strlen (msg_prefix), "");
103 exit (1);
104}
105
106
107
108
109int OpenNextFile (int in_fd) {
110 if (in_fd > 0) close (in_fd);
111 if (num_files == 0) return (-1);
112 if ((in_fd = open (files[0], O_RDONLY)) == -1)
113 FatalError (1, "Cannot open %s", files[0]);
114 files++;
115 num_files--;
116 return (in_fd);
117}
118
119
120static void Driver (int in_fd, char *file_name,
121 const TagInfo &tagInfo, bool compatMode) {
122 // cout << tagInfo;
123
124 int pass;
125
126 unsigned long numBytes = 0;
127 unsigned long numDocs = 0;
128
129 // initialise all the passes
130 for (pass = 0; pass < MAX_PASSES; pass++) {
131 if (Passes & (1 << pass)) {
132 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
133 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
134 }
135 }
136
137
138 // set up various variables
139 FileBuf buf;
140 TextElArray doc;
141 unsigned long docLen = 0;
142
143 // read and process each file (start with an open file)
144 do {
145
146 // read and process each document in this file
147 buf.SetFD (in_fd);
148 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
149
150 // give this document to each pass
151 for (pass = 0; pass < MAX_PASSES; pass++) {
152 if (Passes & (1 << pass)) {
153 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
154 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
155 }
156 }
157
158 // another document has been processed
159 numBytes += docLen;
160 numDocs++;
161 }
162
163 } while ((in_fd = OpenNextFile (in_fd)) > 0);
164
165
166 // do done for each pass
167 for (pass = 0; pass < MAX_PASSES; pass++) {
168 if (Passes & (1 << pass)) {
169 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
170 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
171 }
172 }
173}
174
175int main (int argc, char **argv) {
176 int ch, in_fd;
177 char *filename = NULL;
178 bool compatMode = false;
179 TagInfo tagInfo;
180 tagInfo.SetDocTag ("Document");
181
182 msg_prefix = argv[0];
183
184 opterr = 0;
185 while ((ch=getopt(argc, argv, "J:K:L:f:d:m:I:T:SCh"))!=-1){
186 switch (ch) {
187 case 'J':
188 tagInfo.SetDocTag (optarg);
189 break;
190 case 'K':
191 tagInfo.AddLevelTag (optarg);
192 break;
193 case 'L':
194 tagInfo.SetIndexLevel (optarg);
195 break;
196 case 'f':
197 filename = optarg;
198 break;
199 case 'd':
200 set_basepath (optarg);
201 break;
202 case 'm':
203 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
204 break;
205 case 'I':
206 if (*optarg == '1')
207 Passes |= IVF_PASS_1;
208 else if (*optarg == '2')
209 Passes |= IVF_PASS_2;
210 else
211 Usage ("Invalid pass number");
212 break;
213 case 'T':
214 if (*optarg == '1')
215 Passes |= TEXT_PASS_1;
216 else if (*optarg == '2')
217 Passes |= TEXT_PASS_2;
218 else
219 Usage ("Invalid pass number");
220 break;
221 case 'S':
222 Passes |= SPECIAL;
223 break;
224 case 'C':
225 compatMode = true;
226 break;
227 case 'h':
228 case '?':
229 Usage (NULL);
230 }
231 }
232
233 if (!filename || *filename == '\0')
234 FatalError (1, "A document collection name must be specified.");
235
236 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
237 FatalError (1, "I1 and I2 cannot be done simultaneously.");
238
239 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
240 FatalError (1, "T1 and T2 cannot be done simultaneously.");
241
242 if (!Passes)
243 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
244
245 if (optind < argc) {
246 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
247 FatalError (1, "Cannot open %s", argv[optind]);
248 files = &argv[optind + 1];
249 num_files = argc - (optind + 1);
250
251 } else in_fd = 0; // stdin
252
253
254 if (compatMode) tagInfo.SetDocTag ("Document");
255
256 // a document tag is also a level tag
257 tagInfo.levelTags.insert (tagInfo.docTag);
258
259 Driver (in_fd, filename, tagInfo, compatMode);
260
261 return (0);
262}
Note: See TracBrowser for help on using the repository browser.