source: indexers/trunk/mgpp/text/mgpp_passes.cpp@ 19822

Last change on this file since 19822 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

  • Property svn:keywords set to Author Date Id Revision
File size: 6.8 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24// This was added for Solaris, but it makes things worse on Solaris for me...
25// #define _XOPEN_SOURCE_EXTENDED 1
26
27#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
28#pragma warning(disable:4786)
29#endif
30
31// need this to avoid bizarre compiler problems under VC++ 6.0
32#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
33# include <iostream>
34#endif
35
36#include "sysfuncs.h"
37
38#ifdef HAVE_MALLINFO
39# include <malloc.h>
40#endif
41
42#if defined __WIN32__
43# include <io.h>
44# include "getopt_old.h"
45# define close _close
46# define open _open
47#elif defined __CYGWIN__
48#include "getopt_old.h"
49#else
50# include <unistd.h>
51#endif
52
53#include "memlib.h"
54#include "messages.h"
55#include "longlong.h"
56#include "mg_files.h"
57#include "mg.h"
58#include "build.h"
59#include "text.h"
60#include "stemmer.h"
61#include "FileBuf.h"
62#include "TextEl.h"
63#include "TagInfo.h"
64#include "words.h"
// Number of entries in the PassData table.
#define MAX_PASSES 5

// Pass-selection flags kept in the Passes bitmask.  Driver tests bit
// (1 << i) to decide whether entry i of PassData is run, so each flag
// is written here as the shift that selects its table row.
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

// NOTE(review): not referenced by the code visible in this file.
#define MIN_BUF 8192
74
75
// Size in bytes selected by the -m option (which is given in megabytes);
// defaults to 5 MB.  Has external linkage; it is only written, never read,
// by the code in this file.
unsigned long invf_buffer_size = 5UL * 1024UL * 1024UL;

// Bitmask of the passes requested on the command line.
static char Passes = 0;

// Input files still waiting to be opened, and how many of them remain.
static char **files = NULL;
static int num_files = 0;
81
82
83struct pass_data {
84 char *name;
85 int (*init) (const TagInfo &tagInfo, char *);
86 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
87 int (*done) (const TagInfo &tagInfo, char *);
88};
89
90
91static pass_data PassData[MAX_PASSES] = {
92 {"special", init_special, process_special, done_special},
93 {"text.pass1", init_text_1, process_text_1, done_text_1},
94 {"text.pass2", init_text_2, process_text_2, done_text_2},
95 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
96 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
97};
98
// printf-style usage text.  It takes the program name (%s) followed by two
// (width, "") pairs for the %*s directives, which indent the continuation
// lines by the length of the program name.
// Declared const: it points at a string literal and is only ever read.
static const char *usage_str = "\nUSAGE:\n"
" %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
" %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
" %*s [-C] [-h] [-d directory] [-M maxnumeric] -f name\n\n";
103
104
105
106static void Usage (char *err) {
107 if (err) Message (err);
108 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
109 strlen (msg_prefix), "");
110 exit (1);
111}
112
113
114
115
116int OpenNextFile (int in_fd) {
117 if (in_fd > 0) close (in_fd);
118 if (num_files == 0) return (-1);
119 if ((in_fd = open (files[0], O_RDONLY)) == -1)
120 FatalError (1, "Cannot open %s", files[0]);
121 ++files;
122 --num_files;
123 return (in_fd);
124}
125
126
127static void Driver (int in_fd, char *file_name,
128 const TagInfo &tagInfo, bool compatMode) {
129 // cout << tagInfo;
130
131 int pass;
132
133 unsigned long numBytes = 0;
134 unsigned long numDocs = 0;
135
136 // initialise all the passes
137 for (pass = 0; pass < MAX_PASSES; ++pass) {
138 if (Passes & (1 << pass)) {
139 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
140 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
141 }
142 }
143
144
145 // set up various variables
146 FileBuf buf;
147 TextElArray doc;
148 unsigned long docLen = 0;
149
150 // read and process each file (start with an open file)
151 do {
152
153 // read and process each document in this file
154 buf.SetFD (in_fd);
155 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
156
157 // give this document to each pass
158 for (pass = 0; pass < MAX_PASSES; ++pass) {
159 if (Passes & (1 << pass)) {
160 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
161 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
162 }
163 }
164
165 // another document has been processed
166 numBytes += docLen;
167 ++numDocs;
168 }
169
170 } while ((in_fd = OpenNextFile (in_fd)) > 0);
171
172
173 // do done for each pass
174 for (pass = 0; pass < MAX_PASSES; ++pass) {
175 if (Passes & (1 << pass)) {
176 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
177 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
178 }
179 }
180}
181
182int main (int argc, char **argv) {
183 int ch, in_fd, maxnum;
184 char *filename = NULL;
185 bool compatMode = false;
186 TagInfo tagInfo;
187 tagInfo.SetDocTag ("Document");
188
189 msg_prefix = argv[0];
190
191 opterr = 0;
192 while ((ch=getopt(argc, argv, "J:K:L:M:f:d:m:I:T:SCh"))!=-1){
193 switch (ch) {
194 case 'J':
195 tagInfo.SetDocTag (optarg);
196 break;
197 case 'K':
198 tagInfo.AddLevelTag (optarg);
199 break;
200 case 'L':
201 tagInfo.SetIndexLevel (optarg);
202 break;
203 case 'M':
204 maxnum = atoi(optarg);
205 if (maxnum > 4 && maxnum < 512) {
206 MAXNUMERIC = maxnum;
207 }
208 break;
209 case 'f':
210 filename = optarg;
211 break;
212 case 'd':
213 set_basepath (optarg);
214 break;
215 case 'm':
216 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
217 break;
218 case 'I':
219 if (*optarg == '1')
220 Passes |= IVF_PASS_1;
221 else if (*optarg == '2')
222 Passes |= IVF_PASS_2;
223 else
224 Usage ("Invalid pass number");
225 break;
226 case 'T':
227 if (*optarg == '1')
228 Passes |= TEXT_PASS_1;
229 else if (*optarg == '2')
230 Passes |= TEXT_PASS_2;
231 else
232 Usage ("Invalid pass number");
233 break;
234 case 'S':
235 Passes |= SPECIAL;
236 break;
237 case 'C':
238 compatMode = true;
239 break;
240 case 'h':
241 case '?':
242 Usage (NULL);
243 }
244 }
245
246 if (!filename || *filename == '\0')
247 FatalError (1, "A document collection name must be specified.");
248
249 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
250 FatalError (1, "I1 and I2 cannot be done simultaneously.");
251
252 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
253 FatalError (1, "T1 and T2 cannot be done simultaneously.");
254
255 if (!Passes)
256 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
257
258 if (optind < argc) {
259 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
260 FatalError (1, "Cannot open %s", argv[optind]);
261 files = &argv[optind + 1];
262 num_files = argc - (optind + 1);
263
264 } else in_fd = 0; // stdin
265
266
267 if (compatMode) tagInfo.SetDocTag ("Document");
268
269 // a document tag is also a level tag
270 tagInfo.levelTags.insert (tagInfo.docTag);
271
272 Driver (in_fd, filename, tagInfo, compatMode);
273
274 return (0);
275}
Note: See TracBrowser for help on using the repository browser.