source: trunk/indexers/mgpp/text/mgpp_passes_4jni.cpp@ 8948

Last change on this file since 8948 was 8948, checked in by kjdon, 19 years ago

first stab at jni version of mgpp_passes

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1/**************************************************************************
2 *
 * mgpp_passes_4jni.cpp -- Driver for the various passes (JNI version)
4 * Copyright (C) 1994 Neil Sharman
5 * Copyright (C) 1999 Rodger McNab
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 **************************************************************************/
22
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25
26#if defined(WIN32) || defined(_WIN32) || defined(__WIN32__)
27#pragma warning(disable:4786)
28#endif
29
30// need this to avoid bizarre compiler problems under VC++ 6.0
31#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
32# include <iostream>
33#endif
34
35#include "sysfuncs.h"
36
37#ifdef HAVE_MALLINFO
38# include <malloc.h>
39#endif
40
41#if defined __WIN32__
42# include <io.h>
43# include "getopt_old.h"
44# define close _close
45# define open _open
46#elif defined __CYGWIN__
47#include "getopt_old.h"
48#else
49# include <unistd.h>
50#endif
51
52#include "memlib.h"
53#include "messages.h"
54#include "longlong.h"
55#include "mg_files.h"
56#include "mg.h"
57#include "build.h"
58#include "text.h"
59#include "stemmer.h"
60#include "FileBuf.h"
61#include "TextEl.h"
62#include "TagInfo.h"
63
64#include "mgpp_passes_4jni.h"
65
66#define MAX_PASSES 5
67
68#define SPECIAL 1
69#define TEXT_PASS_1 2
70#define TEXT_PASS_2 4
71#define IVF_PASS_1 8
72#define IVF_PASS_2 16
73
74#define MIN_BUF 8192
75
76
77unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
78
79static char Passes = 0;
80static char **files = NULL;
81static int num_files = 0;
82
83TagInfo tagInfo;
84char *filename = NULL;
85
86unsigned long numBytes = 0;
87unsigned long numDocs = 0;
88int mgpp_passes_exit_value = 0;
89
90struct pass_data {
91 char *name;
92 int (*init) (const TagInfo &tagInfo, char *);
93 int (*process) (const TagInfo &tagInfo, const TextElArray &doc);
94 int (*done) (const TagInfo &tagInfo, char *);
95};
96
97
98static pass_data PassData[MAX_PASSES] = {
99 {"special", init_special, process_special, done_special},
100 {"text.pass1", init_text_1, process_text_1, done_text_1},
101 {"text.pass2", init_text_2, process_text_2, done_text_2},
102 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
103 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
104};
105
106
107/* clear all the settings from one mgpp_passes run to the next */
108void clear_variables() {
109 tagInfo.Clear();
110 tagInfo.SetDocTag ("Document");
111 Passes = 0;
112 invf_buffer_size = 5 * 1024 * 1024;
113 numBytes = 0;
114 numDocs = 0;
115 mgpp_passes_exit_value = 0;
116}
117
118/* ################################################## */
119/* the following are methods to set all the variables that used to be
120 set by command line args */
121
122/* -S, -T1, -T2, -I1, -I2, args to mg_passes */
123void add_pass (char pass_type, char pass_num) {
124
125 switch(pass_type) {
126 case 'S':
127 Passes |= SPECIAL;
128 break;
129 case 'I':
130 if (pass_num == '1')
131 Passes |= IVF_PASS_1;
132 else if (pass_num == '2')
133 Passes |= IVF_PASS_2;
134 else
135 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
136 break;
137 case 'T':
138 if (pass_num == '1')
139 Passes |= TEXT_PASS_1;
140 else if (pass_num == '2')
141 Passes |= TEXT_PASS_2;
142 else
143 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
144 break;
145 }
146}
147
148/* -m arg to mgpp_passes */
149void set_inversion_limit(int limit) {
150 invf_buffer_size = limit * 1024 * 1024;
151}
152
153/* -J arg to mgpp_passes */
154void set_document_tag(const char *tag_name) {
155 tagInfo.SetDocTag (tag_name);
156
157 // a doc tag is also a level tag
158 tagInfo.AddLevelTag(tag_name);
159
160}
161
162/* -K arg to mgpp_passes */
163void add_level_tag(const char *tag_nam) {
164 tagInfo.AddLevelTag(tag_nam);
165}
166
167/* -L arg to mgpp_passes */
168void set_index_level(const char *tag_name) {
169 tagInfo.SetIndexLevel(tag_name);
170}
171
172// is this enough???
173/* -f arg to mgpp_passes */
174void set_filename(const char * filen) {
175 if (filename) {
176 Xfree (filename);
177 filename = NULL;
178 }
179 filename = Xstrdup (filen);
180}
181
182
183/* ############################################### */
/* The old Driver function has been split into three parts:
   init_driver, process_document (called numDocs times) and
   finalise_driver.
   All of the setter functions above should be called before init_driver.
*/
189
190void init_driver () {
191
192 if (!filename || *filename == '\0') {
193 mgpp_passes_exit_value = 1;
194 FatalError (1, "A document collection name must be specified.");
195 }
196 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) {
197 mgpp_passes_exit_value = 1;
198 FatalError (1, "I1 and I2 cannot be done simultaneously.");
199 }
200 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) {
201 mgpp_passes_exit_value = 1;
202 FatalError (1, "T1 and T2 cannot be done simultaneously.");
203 }
204 if (!Passes) {
205 mgpp_passes_exit_value = 1;
206 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
207 }
208
209 // initialise all the passes
210 for (int pass = 0; pass < MAX_PASSES; pass++) {
211 if (Passes & (1 << pass)) {
212 if (PassData[pass].init (tagInfo, filename) == COMPERROR) {
213 mgpp_passes_exit_value = 1;
214 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
215 }
216 }
217 }
218
219}
220
221void process_document(u_char *buffer, int len) {
222
223 TextElArray doc;
224 unsigned long doc_len = 0;
225
226 while(ReadDoc(&buffer, tagInfo.docTag, doc, doc_len, false)) {
227 // give this document to each pass
228 for (int pass = 0; pass < MAX_PASSES; pass++) {
229 if (Passes & (1 << pass)) {
230 if (PassData[pass].process (tagInfo, doc) == COMPERROR) {
231 mgpp_passes_exit_value = 1;
232 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
233 }
234 }
235 }
236
237 // another document has been processed
238 numBytes += doc_len;
239 numDocs++;
240 cout << "doc_len = "<<doc_len<<endl;
241 }
242}
243
244void finalise_driver () {
245
246 // do done for each pass
247 for (int pass = 0; pass < MAX_PASSES; pass++) {
248 if (Passes & (1 << pass)) {
249 if (PassData[pass].done (tagInfo, filename) == COMPERROR) {
250 mgpp_passes_exit_value = 1;
251 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
252 }
253 }
254 }
255}
256
257int get_exit_value() {
258 return mgpp_passes_exit_value;
259}
260 /*
261static void Driver (int in_fd, char *file_name,
262 const TagInfo &tagInfo, bool compatMode) {
263 // cout << tagInfo;
264
265 int pass;
266
267 unsigned long numBytes = 0;
268 unsigned long numDocs = 0;
269
270 // initialise all the passes
271 for (pass = 0; pass < MAX_PASSES; pass++) {
272 if (Passes & (1 << pass)) {
273 if (PassData[pass].init (tagInfo, file_name) == COMPERROR)
274 FatalError (1, "Error during init of \"%s\"", PassData[pass].name);
275 }
276 }
277
278
279 // set up various variables
280 FileBuf buf;
281 TextElArray doc;
282 unsigned long docLen = 0;
283
284 // read and process each file (start with an open file)
285 do {
286
287 // read and process each document in this file
288 buf.SetFD (in_fd);
289 while (ReadDoc (buf, tagInfo.docTag, doc, docLen, compatMode)) {
290
291 // give this document to each pass
292 for (pass = 0; pass < MAX_PASSES; pass++) {
293 if (Passes & (1 << pass)) {
294 if (PassData[pass].process (tagInfo, doc) == COMPERROR)
295 FatalError(1,"Error during processing of \"%s\"",PassData[pass].name);
296 }
297 }
298
299 // another document has been processed
300 numBytes += docLen;
301 numDocs++;
302 }
303
304 } while ((in_fd = OpenNextFile (in_fd)) > 0);
305
306
307 // do done for each pass
308 for (pass = 0; pass < MAX_PASSES; pass++) {
309 if (Passes & (1 << pass)) {
310 if (PassData[pass].done (tagInfo, file_name) == COMPERROR)
311 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
312 }
313 }
314}
315
316 */
Note: See TracBrowser for help on using the repository browser.