source: trunk/gsdl/src/mgpp/text/mg_passes.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_passes.cpp 2442 2001-05-17 04:38:16Z jrm21 $
22 *
23 **************************************************************************/
24
25#define _XOPEN_SOURCE 1
26#define _XOPEN_SOURCE_EXTENDED 1
27
28#include "sysfuncs.h"
29
30#ifdef HAVE_MALLINFO
31# include <malloc.h>
32#endif
33
34#include <unistd.h>
35
36#include "memlib.h"
37#include "messages.h"
38
39#include "longlong.h"
40
41#include "mg_files.h"
42#include "mg.h"
43#include "build.h"
44#include "text.h"
45#include "stemmer.h"
46
47#include "FileBuf.h"
48#include "TextEl.h"
49#include "TagInfo.h"
50
51
52/*
53 $Log$
54 Revision 1.2 2001/05/17 04:38:16 jrm21
55 portability changes, use getopt from unistd.h (all POSIX systems)
56
57 Revision 1.1 2000/01/14 02:26:18 sjboddie
58 Rodgers new C++ mg
59
60 Revision 1.1 1999/10/11 02:58:00 cs025
61 Base install of MG-PP
62
63 Revision 1.1 1999/08/10 21:18:12 sjboddie
64 renamed mg-1.3d directory mg
65
66 Revision 1.3 1998/12/17 09:12:53 rjmcnab
67
68 Altered mg to process utf-8 encoded Unicode. The main changes
69 are in the parsing of the input, the casefolding, and the stemming.
70
71 Revision 1.2 1998/11/25 07:55:47 rjmcnab
72
73 Modified mg so that you can specify the stemmer you want
74 to use via a command line option. You specify it to
75 mg_passes during the build process. The number of the
76 stemmer that you used is stored within the inverted
77 dictionary header and the stemmed dictionary header so
78 the correct stemmer is used in later stages of building
79 and querying.
80
81 Revision 1.1 1998/11/17 09:35:13 rjmcnab
82 *** empty log message ***
83
84 * Revision 1.3 1994/10/20 03:56:57 tes
85 * I have rewritten the boolean query optimiser and abstracted out the
86 * components of the boolean query.
87 *
88 * Revision 1.2 1994/09/20 04:41:52 tes
89 * For version 1.1
90 *
91 */
92
/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Pass-selection flags that are OR-ed together into `Passes`.
 * Driver() tests bit (1 << i) to decide whether PassData[i] runs, so each
 * flag is written as the shift that matches its table index.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* 8192; not referenced by the code visible in this file. */
#define MIN_BUF (8 * 1024)
102
103
/*
 * Buffer size in bytes, settable with the -m option (given in megabytes).
 * Defaults to 5 MB.  Has external linkage; presumably read by the inverted
 * file passes -- confirm against their sources.
 */
unsigned long invf_buffer_size = 5UL * 1024UL * 1024UL;

/* Bitmask of the pass flags (SPECIAL, TEXT_PASS_*, IVF_PASS_*) chosen in main(). */
static char Passes = 0;

/* Input files still to be opened; OpenNextFile() consumes them front to back. */
static char **files = NULL;
static int num_files = 0;
109
110
111struct pass_data {
112 char *name;
113 int (*init) (const TagInfo &tagInfo, char *);
114 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
115 int (*done) (const TagInfo &tagInfo, char *);
116};
117
118
119static pass_data PassData[MAX_PASSES] = {
120 {"special", init_special, process_special, done_special},
121 {"text.pass1", init_text_1, process_text_1, done_text_1},
122 {"text.pass2", init_text_2, process_text_2, done_text_2},
123 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
124 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
125};
126
/*
 * printf-style template for the usage message.  Arguments, in order:
 *   %s   program name
 *   %*s  (int width, string) -- used twice to indent the continuation lines
 *        by the width of the program name
 * The text is a string literal and must never be written to, so both the
 * pointer and the characters it points at are const.
 */
static const char *const usage_str = "\nUSAGE:\n"
" %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
" %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
" %*s [-C] [-h] [-d directory] -f name\n\n";
131
132
133
134static void Usage (char *err) {
135 if (err) Message (err);
136 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
137 strlen (msg_prefix), "");
138 exit (1);
139}
140
141
142
143
144int OpenNextFile (int in_fd) {
145 if (in_fd > 0) close (in_fd);
146 if (num_files == 0) return (-1);
147 if ((in_fd = open (files[0], O_RDONLY)) == -1)
148 FatalError (1, "Cannot open %s", files[0]);
149 files++;
150 num_files--;
151 return (in_fd);
152}
153
154
155static void Driver (int in_fd, char *file_name,
156 const TagInfo &tagInfo, bool compatMode) {
157 // cout << tagInfo;
158
159 int pass;
160
161 unsigned long numBytes = 0;
162 unsigned long numDocs = 0;
163
164 // initialise all the passes
165 for (pass = 0; pass < MAX_PASSES; pass++) {
166 if (Passes & (1 << pass)) {
167 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
168 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
169 }
170 }
171
172
173 // set up various variables
174 FileBuf buf;
175 TextElArray doc;
176 unsigned long docLen = 0;
177
178 // read and process each file (start with an open file)
179 do {
180
181 // read and process each document in this file
182 buf.SetFD (in_fd);
183 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
184
185 // give this document to each pass
186 for (pass = 0; pass < MAX_PASSES; pass++) {
187 if (Passes & (1 << pass)) {
188 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
189 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
190 }
191 }
192
193 // another document has been processed
194 numBytes += docLen;
195 numDocs++;
196 }
197
198 } while ((in_fd = OpenNextFile (in_fd)) > 0);
199
200
201 // do done for each pass
202 for (pass = 0; pass < MAX_PASSES; pass++) {
203 if (Passes & (1 << pass)) {
204 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
205 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
206 }
207 }
208}
209
210
211
212int main (int argc, char **argv) {
213 int ch, in_fd;
214 char *filename = NULL;
215 bool compatMode = false;
216 TagInfo tagInfo;
217 tagInfo.SetDocTag ("Document");
218
219 msg_prefix = argv[0];
220
221 opterr = 0;
222 while ((ch=getopt(argc, argv, "J:K:L:f:d:m:I:T:SCh"))!=-1){
223 switch (ch) {
224 case 'J':
225 tagInfo.SetDocTag (optarg);
226 break;
227 case 'K':
228 tagInfo.AddLevelTag (optarg);
229 break;
230 case 'L':
231 tagInfo.SetIndexLevel (optarg);
232 break;
233 case 'f':
234 filename = optarg;
235 break;
236 case 'd':
237 set_basepath (optarg);
238 break;
239 case 'm':
240 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
241 break;
242 case 'I':
243 if (*optarg == '1')
244 Passes |= IVF_PASS_1;
245 else if (*optarg == '2')
246 Passes |= IVF_PASS_2;
247 else
248 Usage ("Invalid pass number");
249 break;
250 case 'T':
251 if (*optarg == '1')
252 Passes |= TEXT_PASS_1;
253 else if (*optarg == '2')
254 Passes |= TEXT_PASS_2;
255 else
256 Usage ("Invalid pass number");
257 break;
258 case 'S':
259 Passes |= SPECIAL;
260 break;
261 case 'C':
262 compatMode = true;
263 break;
264 case 'h':
265 case '?':
266 Usage (NULL);
267 }
268 }
269
270 if (!filename || *filename == '\0')
271 FatalError (1, "A document collection name must be specified.");
272
273 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
274 FatalError (1, "I1 and I2 cannot be done simultaneously.");
275
276 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
277 FatalError (1, "T1 and T2 cannot be done simultaneously.");
278
279 if (!Passes)
280 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
281
282 if (optind < argc) {
283 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
284 FatalError (1, "Cannot open %s", argv[optind]);
285 files = &argv[optind + 1];
286 num_files = argc - (optind + 1);
287
288 } else in_fd = 0; // stdin
289
290
291 if (compatMode) tagInfo.SetDocTag ("Document");
292
293 // a document tag is also a level tag
294 tagInfo.levelTags.insert (tagInfo.docTag);
295
296 Driver (in_fd, filename, tagInfo, compatMode);
297
298 exit (0);
299}
Note: See TracBrowser for help on using the repository browser.