source: indexers/trunk/mgpp/text/mgpp_passes_4jni.cpp@ 19822

Last change on this file since 19822 was 19822, checked in by mdewsnip, 15 years ago

Commented out all occurrences of

#define _XOPEN_SOURCE_EXTENDED 1

This was allegedly added for compilation on Solaris, but it just causes errors for me (on the NLNZ Solaris machines).

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_passes.cpp -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24// This was added for Solaris, but it makes things worse on Solaris for me...
25// #define _XOPEN_SOURCE_EXTENDED 1
26
27#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
28#pragma warning(disable:4786)
29#endif
30
31// need this to avoid bizarre compiler problems under VC++ 6.0
32#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
33# include <iostream>
34#endif
35
36#include "sysfuncs.h"
37
38#ifdef HAVE_MALLINFO
39# include <malloc.h>
40#endif
41
42#if defined __WIN32__
43# include <io.h>
44# include "getopt_old.h"
45# define close _close
46# define open _open
47#elif defined __CYGWIN__
48#include "getopt_old.h"
49#else
50# include <unistd.h>
51#endif
52
53#include "memlib.h"
54#include "messages.h"
55#include "longlong.h"
56#include "mg_files.h"
57#include "mg.h"
58#include "build.h"
59#include "text.h"
60#include "stemmer.h"
61#include "FileBuf.h"
62#include "TextEl.h"
63#include "TagInfo.h"
64
65#include "mgpp_passes_4jni.h"
66
/* Number of entries in the pass table; there is one entry per flag below. */
#define MAX_PASSES 5

/* Bit flags, one per pass, OR-ed together to record which passes were
   requested.  Flag i has the value (1 << i), which is the expression the
   driver loops use to test whether pass-table entry i is selected. */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Not referenced in the visible part of this file. */
#define MIN_BUF 8192
76
77
78unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
79
80static char Passes = 0;
81static char **files = NULL;
82static int num_files = 0;
83
84TagInfo tagInfo;
85char *filename = NULL;
86
87unsigned long numBytes = 0;
88unsigned long numDocs = 0;
89int mgpp_passes_exit_value = 0;
90
91struct pass_data {
92 char *name;
93 int (*init) (const TagInfo &tagInfo, char *);
94 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
95 int (*done) (const TagInfo &tagInfo, char *);
96};
97
98
99static pass_data PassData[MAX_PASSES] = {
100 {"special", init_special, process_special, done_special},
101 {"text.pass1", init_text_1, process_text_1, done_text_1},
102 {"text.pass2", init_text_2, process_text_2, done_text_2},
103 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
104 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
105};
106
107
108/* clear all the settings from one mgpp_passes run to the next */
109void clear_variables() {
110 tagInfo.Clear();
111 tagInfo.SetDocTag ("Document");
112 Passes = 0;
113 invf_buffer_size = 5 * 1024 * 1024;
114 numBytes = 0;
115 numDocs = 0;
116 mgpp_passes_exit_value = 0;
117}
118
119/* ################################################## */
120/* the following are methods to set all the variables that used to be
121 set by command line args */
122
123/* -S, -T1, -T2, -I1, -I2, args to mg_passes */
124void add_pass (char pass_type, char pass_num) {
125
126 switch(pass_type) {
127 case 'S':
128 Passes |= SPECIAL;
129 break;
130 case 'I':
131 if (pass_num == '1')
132 Passes |= IVF_PASS_1;
133 else if (pass_num == '2')
134 Passes |= IVF_PASS_2;
135 else
136 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
137 break;
138 case 'T':
139 if (pass_num == '1')
140 Passes |= TEXT_PASS_1;
141 else if (pass_num == '2')
142 Passes |= TEXT_PASS_2;
143 else
144 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
145 break;
146 }
147}
148
149/* -m arg to mgpp_passes */
150void set_inversion_limit(int limit) {
151 invf_buffer_size = limit * 1024 * 1024;
152}
153
154/* -J arg to mgpp_passes */
155void set_document_tag(const char *tag_name) {
156 tagInfo.SetDocTag (tag_name);
157
158 // a doc tag is also a level tag
159 tagInfo.AddLevelTag(tag_name);
160
161}
162
163/* -K arg to mgpp_passes */
164void add_level_tag(const char *tag_nam) {
165 tagInfo.AddLevelTag(tag_nam);
166}
167
168/* -L arg to mgpp_passes */
169void set_index_level(const char *tag_name) {
170 tagInfo.SetIndexLevel(tag_name);
171}
172
173// is this enough???
174/* -f arg to mgpp_passes */
175void set_filename(const char * filen) {
176 if (filename) {
177 Xfree (filename);
178 filename = NULL;
179 }
180 filename = Xstrdup (filen);
181}
182
183
184/* ############################################### */
/* The old Driver function has been split into three parts: init_driver,
   process_document (called numdocs times) and finalise_driver.
   All of the setter functions above should be called before init_driver.
*/
190
191void init_driver () {
192
193 if (!filename || *filename == '\0') {
194 mgpp_passes_exit_value = 1;
195 FatalError (1, "A document collection name must be specified.");
196 }
197 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) {
198 mgpp_passes_exit_value = 1;
199 FatalError (1, "I1 and I2 cannot be done simultaneously.");
200 }
201 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) {
202 mgpp_passes_exit_value = 1;
203 FatalError (1, "T1 and T2 cannot be done simultaneously.");
204 }
205 if (!Passes) {
206 mgpp_passes_exit_value = 1;
207 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
208 }
209
210 // initialise all the passes
211 for (int pass = 0; pass < MAX_PASSES; pass++) {
212 if (Passes & (1 << pass)) {
213 if (PassData[pass].init (tagInfo, filename) == COMPERROR) {
214 mgpp_passes_exit_value = 1;
215 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
216 }
217 }
218 }
219
220}
221
222void process_document(u_char *buffer, int len) {
223
224 TextElArray doc;
225 unsigned long doc_len = 0;
226
227 while(ReadDoc(&buffer, tagInfo.docTag, doc, doc_len, false)) {
228 // give this document to each pass
229 for (int pass = 0; pass < MAX_PASSES; pass++) {
230 if (Passes & (1 << pass)) {
231 if (PassData[pass].process (tagInfo, doc) == COMPERROR) {
232 mgpp_passes_exit_value = 1;
233 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
234 }
235 }
236 }
237
238 // another document has been processed
239 numBytes += doc_len;
240 numDocs++;
241 cout << "doc_len = "<<doc_len<<endl;
242 }
243}
244
245void finalise_driver () {
246
247 // do done for each pass
248 for (int pass = 0; pass < MAX_PASSES; pass++) {
249 if (Passes & (1 << pass)) {
250 if (PassData[pass].done (tagInfo, filename) == COMPERROR) {
251 mgpp_passes_exit_value = 1;
252 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
253 }
254 }
255 }
256}
257
258int get_exit_value() {
259 return mgpp_passes_exit_value;
260}
261 /*
262static void Driver (int in_fd, char *file_name,
263 const TagInfo &tagInfo, bool compatMode) {
264 // cout << tagInfo;
265
266 int pass;
267
268 unsigned long numBytes = 0;
269 unsigned long numDocs = 0;
270
271 // initialise all the passes
272 for (pass = 0; pass < MAX_PASSES; pass++) {
273 if (Passes & (1 << pass)) {
274 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
275 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
276 }
277 }
278
279
280 // set up various variables
281 FileBuf buf;
282 TextElArray doc;
283 unsigned long docLen = 0;
284
285 // read and process each file (start with an open file)
286 do {
287
288 // read and process each document in this file
289 buf.SetFD (in_fd);
290 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
291
292 // give this document to each pass
293 for (pass = 0; pass < MAX_PASSES; pass++) {
294 if (Passes & (1 << pass)) {
295 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
296 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
297 }
298 }
299
300 // another document has been processed
301 numBytes += docLen;
302 numDocs++;
303 }
304
305 } while ((in_fd = OpenNextFile (in_fd)) > 0);
306
307
308 // do done for each pass
309 for (pass = 0; pass < MAX_PASSES; pass++) {
310 if (Passes & (1 << pass)) {
311 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
312 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
313 }
314 }
315}
316
317 */
Note: See TracBrowser for help on using the repository browser.