source: trunk/gsdl/src/mgpp/text/mg_passes.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * $Id: mg_passes.cpp 856 2000-01-14 02:26:25Z sjboddie $
22 *
23 **************************************************************************/
24
25#include "sysfuncs.h"
26
27#ifdef HAVE_MALLINFO
28# include <malloc.h>
29#endif
30
31#include "memlib.h"
32#include "messages.h"
33
34#include "longlong.h"
35
36#include "mg_files.h"
37#include "mg.h"
38#include "build.h"
39#include "text.h"
40#include "stemmer.h"
41
42#include "FileBuf.h"
43#include "TextEl.h"
44#include "TagInfo.h"
45
46
47/*
48 $Log$
49 Revision 1.1 2000/01/14 02:26:18 sjboddie
50 Rodgers new C++ mg
51
52 Revision 1.1 1999/10/11 02:58:00 cs025
53 Base install of MG-PP
54
55 Revision 1.1 1999/08/10 21:18:12 sjboddie
56 renamed mg-1.3d directory mg
57
58 Revision 1.3 1998/12/17 09:12:53 rjmcnab
59
60 Altered mg to process utf-8 encoded Unicode. The main changes
61 are in the parsing of the input, the casefolding, and the stemming.
62
63 Revision 1.2 1998/11/25 07:55:47 rjmcnab
64
65 Modified mg so that you can specify the stemmer you want
66 to use via a command line option. You specify it to
67 mg_passes during the build process. The number of the
68 stemmer that you used is stored within the inverted
69 dictionary header and the stemmed dictionary header so
70 the correct stemmer is used in later stages of building
71 and querying.
72
73 Revision 1.1 1998/11/17 09:35:13 rjmcnab
74 *** empty log message ***
75
76 * Revision 1.3 1994/10/20 03:56:57 tes
77 * I have rewritten the boolean query optimiser and abstracted out the
78 * components of the boolean query.
79 *
80 * Revision 1.2 1994/09/20 04:41:52 tes
81 * For version 1.1
82 *
83 */
84
/* Number of entries in the pass table (one per pass flag below). */
#define MAX_PASSES 5

/*
 * Pass-selection flags.  They are OR-ed together into Passes; the driver
 * tests bit (1 << i) to decide whether entry i of the pass table runs.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

#define MIN_BUF 8192


/* 5Mb by default; main() replaces it when -m is given. */
unsigned long invf_buffer_size = 5 * 1024 * 1024;

/* Bitmask of the pass flags requested on the command line. */
static char Passes = 0;

/* Input files that have not been opened yet, and how many remain. */
static char **files = NULL;
static int num_files = 0;
101
102
103struct pass_data {
104 char *name;
105 int (*init) (const TagInfo &tagInfo, char *);
106 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
107 int (*done) (const TagInfo &tagInfo, char *);
108};
109
110
111static pass_data PassData[MAX_PASSES] = {
112 {"special", init_special, process_special, done_special},
113 {"text.pass1", init_text_1, process_text_1, done_text_1},
114 {"text.pass2", init_text_2, process_text_2, done_text_2},
115 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
116 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
117};
118
/*
 * printf-style template for the usage message.  The arguments are: the
 * program name (%s), then two (int width, string) pairs for the "%*s"
 * fields, which indent the continuation lines to the width of the name.
 *
 * Declared const char * because it points at a string literal.
 */
static const char *usage_str = "\nUSAGE:\n"
"  %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
"  %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
"  %*s [-C] [-h] [-d directory] -f name\n\n";
123
124
125
126static void Usage (char *err) {
127 if (err) Message (err);
128 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
129 strlen (msg_prefix), "");
130 exit (1);
131}
132
133
134
135
136int OpenNextFile (int in_fd) {
137 if (in_fd > 0) close (in_fd);
138 if (num_files == 0) return (-1);
139 if ((in_fd = open (files[0], O_RDONLY)) == -1)
140 FatalError (1, "Cannot open %s", files[0]);
141 files++;
142 num_files--;
143 return (in_fd);
144}
145
146
147static void Driver (int in_fd, char *file_name,
148 const TagInfo &tagInfo, bool compatMode) {
149 // cout << tagInfo;
150
151 int pass;
152
153 unsigned long numBytes = 0;
154 unsigned long numDocs = 0;
155
156 // initialise all the passes
157 for (pass = 0; pass < MAX_PASSES; pass++) {
158 if (Passes & (1 << pass)) {
159 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
160 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
161 }
162 }
163
164
165 // set up various variables
166 FileBuf buf;
167 TextElArray doc;
168 unsigned long docLen = 0;
169
170 // read and process each file (start with an open file)
171 do {
172
173 // read and process each document in this file
174 buf.SetFD (in_fd);
175 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
176
177 // give this document to each pass
178 for (pass = 0; pass < MAX_PASSES; pass++) {
179 if (Passes & (1 << pass)) {
180 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
181 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
182 }
183 }
184
185 // another document has been processed
186 numBytes += docLen;
187 numDocs++;
188 }
189
190 } while ((in_fd = OpenNextFile (in_fd)) > 0);
191
192
193 // do done for each pass
194 for (pass = 0; pass < MAX_PASSES; pass++) {
195 if (Passes & (1 << pass)) {
196 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
197 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
198 }
199 }
200}
201
202
203
204int main (int argc, char **argv) {
205 int ch, in_fd;
206 char *filename = NULL;
207 bool compatMode = false;
208 TagInfo tagInfo;
209 tagInfo.SetDocTag ("Document");
210
211 msg_prefix = argv[0];
212
213 opterr = 0;
214 while ((ch=getopt(argc, argv, "J:K:L:f:d:m:I:T:SCh"))!=-1){
215 switch (ch) {
216 case 'J':
217 tagInfo.SetDocTag (optarg);
218 break;
219 case 'K':
220 tagInfo.AddLevelTag (optarg);
221 break;
222 case 'L':
223 tagInfo.SetIndexLevel (optarg);
224 break;
225 case 'f':
226 filename = optarg;
227 break;
228 case 'd':
229 set_basepath (optarg);
230 break;
231 case 'm':
232 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
233 break;
234 case 'I':
235 if (*optarg == '1')
236 Passes |= IVF_PASS_1;
237 else if (*optarg == '2')
238 Passes |= IVF_PASS_2;
239 else
240 Usage ("Invalid pass number");
241 break;
242 case 'T':
243 if (*optarg == '1')
244 Passes |= TEXT_PASS_1;
245 else if (*optarg == '2')
246 Passes |= TEXT_PASS_2;
247 else
248 Usage ("Invalid pass number");
249 break;
250 case 'S':
251 Passes |= SPECIAL;
252 break;
253 case 'C':
254 compatMode = true;
255 break;
256 case 'h':
257 case '?':
258 Usage (NULL);
259 }
260 }
261
262 if (!filename || *filename == '\0')
263 FatalError (1, "A document collection name must be specified.");
264
265 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
266 FatalError (1, "I1 and I2 cannot be done simultaneously.");
267
268 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
269 FatalError (1, "T1 and T2 cannot be done simultaneously.");
270
271 if (!Passes)
272 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
273
274 if (optind < argc) {
275 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
276 FatalError (1, "Cannot open %s", argv[optind]);
277 files = &argv[optind + 1];
278 num_files = argc - (optind + 1);
279
280 } else in_fd = 0; // stdin
281
282
283 if (compatMode) tagInfo.SetDocTag ("Document");
284
285 // a document tag is also a level tag
286 tagInfo.levelTags.insert (tagInfo.docTag);
287
288 Driver (in_fd, filename, tagInfo, compatMode);
289
290 exit (0);
291}
Note: See TracBrowser for help on using the repository browser.