source: gsdl/trunk/trunk/mgpp/text/mgpp_passes.cpp@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 6.7 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25
26#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
27#pragma warning(disable:4786)
28#endif
29
30// need this to avoid bizarre compiler problems under VC++ 6.0
31#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
32# include <iostream>
33#endif
34
35#include "sysfuncs.h"
36
37#ifdef HAVE_MALLINFO
38# include <malloc.h>
39#endif
40
41#if defined __WIN32__
42# include <io.h>
43# include "getopt_old.h"
44# define close _close
45# define open _open
46#elif defined __CYGWIN__
47#include "getopt_old.h"
48#else
49# include <unistd.h>
50#endif
51
52#include "memlib.h"
53#include "messages.h"
54#include "longlong.h"
55#include "mg_files.h"
56#include "mg.h"
57#include "build.h"
58#include "text.h"
59#include "stemmer.h"
60#include "FileBuf.h"
61#include "TextEl.h"
62#include "TagInfo.h"
63#include "words.h"
// Number of entries in the PassData table.
#define MAX_PASSES 5

// Pass-selection flags that are OR-ed into Passes.  Driver tests
// Passes & (1 << i) to decide whether PassData[i] runs, so each flag's
// bit position is the index of its entry in that table.
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

// NOTE(review): not referenced in this file; presumably a minimum
// buffer size in bytes -- confirm against the other passes.
#define MIN_BUF 8192
73
74
/* Default of 5Mb; main overwrites this from the -m invf-memory option. */
unsigned long invf_buffer_size = 5UL * 1024UL * 1024UL;

/* Bitmask of the passes selected on the command line. */
static char Passes = 0;

/* Input files still waiting to be opened by OpenNextFile, and how many. */
static char **files = NULL;
static int num_files = 0;
80
81
82struct pass_data {
83 char *name;
84 int (*init) (const TagInfo &tagInfo, char *);
85 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
86 int (*done) (const TagInfo &tagInfo, char *);
87};
88
89
90static pass_data PassData[MAX_PASSES] = {
91 {"special", init_special, process_special, done_special},
92 {"text.pass1", init_text_1, process_text_1, done_text_1},
93 {"text.pass2", init_text_2, process_text_2, done_text_2},
94 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
95 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
96};
97
// printf-style usage text.  Arguments expected, in order: the program
// name (%s), then for each of the two continuation lines an int field
// width and a string (%*s), which Usage uses to indent them to the width
// of the program name.
// Declared const char * because it points at a string literal.
static const char *usage_str = "\nUSAGE:\n"
" %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
" %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
" %*s [-C] [-h] [-d directory] [-M maxnumeric] -f name\n\n";
102
103
104
105static void Usage (char *err) {
106 if (err) Message (err);
107 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
108 strlen (msg_prefix), "");
109 exit (1);
110}
111
112
113
114
115int OpenNextFile (int in_fd) {
116 if (in_fd > 0) close (in_fd);
117 if (num_files == 0) return (-1);
118 if ((in_fd = open (files[0], O_RDONLY)) == -1)
119 FatalError (1, "Cannot open %s", files[0]);
120 ++files;
121 --num_files;
122 return (in_fd);
123}
124
125
126static void Driver (int in_fd, char *file_name,
127 const TagInfo &tagInfo, bool compatMode) {
128 // cout << tagInfo;
129
130 int pass;
131
132 unsigned long numBytes = 0;
133 unsigned long numDocs = 0;
134
135 // initialise all the passes
136 for (pass = 0; pass < MAX_PASSES; ++pass) {
137 if (Passes & (1 << pass)) {
138 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
139 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
140 }
141 }
142
143
144 // set up various variables
145 FileBuf buf;
146 TextElArray doc;
147 unsigned long docLen = 0;
148
149 // read and process each file (start with an open file)
150 do {
151
152 // read and process each document in this file
153 buf.SetFD (in_fd);
154 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
155
156 // give this document to each pass
157 for (pass = 0; pass < MAX_PASSES; ++pass) {
158 if (Passes & (1 << pass)) {
159 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
160 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
161 }
162 }
163
164 // another document has been processed
165 numBytes += docLen;
166 ++numDocs;
167 }
168
169 } while ((in_fd = OpenNextFile (in_fd)) > 0);
170
171
172 // do done for each pass
173 for (pass = 0; pass < MAX_PASSES; ++pass) {
174 if (Passes & (1 << pass)) {
175 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
176 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
177 }
178 }
179}
180
181int main (int argc, char **argv) {
182 int ch, in_fd, maxnum;
183 char *filename = NULL;
184 bool compatMode = false;
185 TagInfo tagInfo;
186 tagInfo.SetDocTag ("Document");
187
188 msg_prefix = argv[0];
189
190 opterr = 0;
191 while ((ch=getopt(argc, argv, "J:K:L:M:f:d:m:I:T:SCh"))!=-1){
192 switch (ch) {
193 case 'J':
194 tagInfo.SetDocTag (optarg);
195 break;
196 case 'K':
197 tagInfo.AddLevelTag (optarg);
198 break;
199 case 'L':
200 tagInfo.SetIndexLevel (optarg);
201 break;
202 case 'M':
203 maxnum = atoi(optarg);
204 if (maxnum > 4 && maxnum < 512) {
205 MAXNUMERIC = maxnum;
206 }
207 break;
208 case 'f':
209 filename = optarg;
210 break;
211 case 'd':
212 set_basepath (optarg);
213 break;
214 case 'm':
215 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
216 break;
217 case 'I':
218 if (*optarg == '1')
219 Passes |= IVF_PASS_1;
220 else if (*optarg == '2')
221 Passes |= IVF_PASS_2;
222 else
223 Usage ("Invalid pass number");
224 break;
225 case 'T':
226 if (*optarg == '1')
227 Passes |= TEXT_PASS_1;
228 else if (*optarg == '2')
229 Passes |= TEXT_PASS_2;
230 else
231 Usage ("Invalid pass number");
232 break;
233 case 'S':
234 Passes |= SPECIAL;
235 break;
236 case 'C':
237 compatMode = true;
238 break;
239 case 'h':
240 case '?':
241 Usage (NULL);
242 }
243 }
244
245 if (!filename || *filename == '\0')
246 FatalError (1, "A document collection name must be specified.");
247
248 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
249 FatalError (1, "I1 and I2 cannot be done simultaneously.");
250
251 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
252 FatalError (1, "T1 and T2 cannot be done simultaneously.");
253
254 if (!Passes)
255 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
256
257 if (optind < argc) {
258 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
259 FatalError (1, "Cannot open %s", argv[optind]);
260 files = &argv[optind + 1];
261 num_files = argc - (optind + 1);
262
263 } else in_fd = 0; // stdin
264
265
266 if (compatMode) tagInfo.SetDocTag ("Document");
267
268 // a document tag is also a level tag
269 tagInfo.levelTags.insert (tagInfo.docTag);
270
271 Driver (in_fd, filename, tagInfo, compatMode);
272
273 return (0);
274}
Note: See TracBrowser for help on using the repository browser.