source: trunk/indexers/mgpp/text/mgpp_passes.cpp@ 3365

Last change on this file since 3365 was 3365, checked in by kjdon, 22 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25
26// need this to avoid bizarre compiler problems under VC++ 6.0
27#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
28# include <iostream>
29#endif
30
31#include "sysfuncs.h"
32
33#ifdef HAVE_MALLINFO
34# include <malloc.h>
35#endif
36
37#if defined __WIN32__
38# include <io.h>
39# include "getopt_old.h"
40# define close _close
41# define open _open
42#elif defined __CYGWIN__
43#include "getopt_old.h"
44#else
45# include <unistd.h>
46#endif
47
48#include "memlib.h"
49#include "messages.h"
50#include "longlong.h"
51#include "mg_files.h"
52#include "mg.h"
53#include "build.h"
54#include "text.h"
55#include "stemmer.h"
56#include "FileBuf.h"
57#include "TextEl.h"
58#include "TagInfo.h"
59
/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/* Pass-selection flags, OR-ed together into Passes.
 * Driver() tests (1 << i) to decide whether PassData[i] runs, so each
 * flag's bit position is the index of its PassData entry. */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Not referenced anywhere else in this file. */
#define MIN_BUF 8192
69
70
/* Value of the -m (invf-memory) option converted to bytes; defaults to
 * 5 MB. Has external linkage (it is not static). */
unsigned long invf_buffer_size = 5UL * 1024UL * 1024UL;

/* Bit set of the pass flags selected on the command line. */
static char Passes = 0;

/* Input files that have not been opened yet, and how many remain;
 * OpenNextFile() consumes them from the front. */
static char **files = NULL;
static int num_files = 0;
76
77
78struct pass_data {
79 char *name;
80 int (*init) (const TagInfo &tagInfo, char *);
81 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
82 int (*done) (const TagInfo &tagInfo, char *);
83};
84
85
86static pass_data PassData[MAX_PASSES] = {
87 {"special", init_special, process_special, done_special},
88 {"text.pass1", init_text_1, process_text_1, done_text_1},
89 {"text.pass2", init_text_2, process_text_2, done_text_2},
90 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
91 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
92};
93
// printf-style usage text printed by Usage(). The first line takes the
// program name (%s); each continuation line takes a width and an empty
// string (%*s) so that it is indented by the length of the program name.
// Declared const because it points at a string literal.
static const char *usage_str = "\nUSAGE:\n"
" %s [-J doc-tag] [-K level-tag] [-L index-level]\n"
" %*s [-m invf-memory] [-T1] [-T2] [-I1] [-I2] [-S]\n"
" %*s [-C] [-h] [-d directory] -f name\n\n";
98
99
100
101static void Usage (char *err) {
102 if (err) Message (err);
103 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
104 strlen (msg_prefix), "");
105 exit (1);
106}
107
108
109
110
111int OpenNextFile (int in_fd) {
112 if (in_fd > 0) close (in_fd);
113 if (num_files == 0) return (-1);
114 if ((in_fd = open (files[0], O_RDONLY)) == -1)
115 FatalError (1, "Cannot open %s", files[0]);
116 files++;
117 num_files--;
118 return (in_fd);
119}
120
121
122static void Driver (int in_fd, char *file_name,
123 const TagInfo &tagInfo, bool compatMode) {
124 // cout << tagInfo;
125
126 int pass;
127
128 unsigned long numBytes = 0;
129 unsigned long numDocs = 0;
130
131 // initialise all the passes
132 for (pass = 0; pass < MAX_PASSES; pass++) {
133 if (Passes & (1 << pass)) {
134 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
135 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
136 }
137 }
138
139
140 // set up various variables
141 FileBuf buf;
142 TextElArray doc;
143 unsigned long docLen = 0;
144
145 // read and process each file (start with an open file)
146 do {
147
148 // read and process each document in this file
149 buf.SetFD (in_fd);
150 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
151
152 // give this document to each pass
153 for (pass = 0; pass < MAX_PASSES; pass++) {
154 if (Passes & (1 << pass)) {
155 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
156 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
157 }
158 }
159
160 // another document has been processed
161 numBytes += docLen;
162 numDocs++;
163 }
164
165 } while ((in_fd = OpenNextFile (in_fd)) > 0);
166
167
168 // do done for each pass
169 for (pass = 0; pass < MAX_PASSES; pass++) {
170 if (Passes & (1 << pass)) {
171 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
172 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
173 }
174 }
175}
176
177int main (int argc, char **argv) {
178 int ch, in_fd;
179 char *filename = NULL;
180 bool compatMode = false;
181 TagInfo tagInfo;
182 tagInfo.SetDocTag ("Document");
183
184 msg_prefix = argv[0];
185
186 opterr = 0;
187 while ((ch=getopt(argc, argv, "J:K:L:f:d:m:I:T:SCh"))!=-1){
188 switch (ch) {
189 case 'J':
190 tagInfo.SetDocTag (optarg);
191 break;
192 case 'K':
193 tagInfo.AddLevelTag (optarg);
194 break;
195 case 'L':
196 tagInfo.SetIndexLevel (optarg);
197 break;
198 case 'f':
199 filename = optarg;
200 break;
201 case 'd':
202 set_basepath (optarg);
203 break;
204 case 'm':
205 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
206 break;
207 case 'I':
208 if (*optarg == '1')
209 Passes |= IVF_PASS_1;
210 else if (*optarg == '2')
211 Passes |= IVF_PASS_2;
212 else
213 Usage ("Invalid pass number");
214 break;
215 case 'T':
216 if (*optarg == '1')
217 Passes |= TEXT_PASS_1;
218 else if (*optarg == '2')
219 Passes |= TEXT_PASS_2;
220 else
221 Usage ("Invalid pass number");
222 break;
223 case 'S':
224 Passes |= SPECIAL;
225 break;
226 case 'C':
227 compatMode = true;
228 break;
229 case 'h':
230 case '?':
231 Usage (NULL);
232 }
233 }
234
235 if (!filename || *filename == '\0')
236 FatalError (1, "A document collection name must be specified.");
237
238 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
239 FatalError (1, "I1 and I2 cannot be done simultaneously.");
240
241 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
242 FatalError (1, "T1 and T2 cannot be done simultaneously.");
243
244 if (!Passes)
245 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
246
247 if (optind < argc) {
248 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
249 FatalError (1, "Cannot open %s", argv[optind]);
250 files = &argv[optind + 1];
251 num_files = argc - (optind + 1);
252
253 } else in_fd = 0; // stdin
254
255
256 if (compatMode) tagInfo.SetDocTag ("Document");
257
258 // a document tag is also a level tag
259 tagInfo.levelTags.insert (tagInfo.docTag);
260
261 Driver (in_fd, filename, tagInfo, compatMode);
262
263 return (0);
264}
Note: See TracBrowser for help on using the repository browser.