source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneIndexer.java@29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of the port from lucene3.3.0 to lucene4.7.2, related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling either Lucene4Wrapper.jar or Lucene3Wrapper.jar; only the Linux Makefile.in has been tested so far. 2. Adding the jar files needed for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene source code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 12.2 KB
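The indexer below is normally run by the Greenstone build scripts as `java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index`, reading the XML emitted by lucenebuildproc.pm on stdin. The Indexer inner class can also be driven directly. The following is a minimal sketch of that (not part of the file itself): the class name IndexerUsageSketch, the index path "building/idx", the field name TI and the docOID value are all illustrative, and it assumes the LuceneWrapper4 classes (GS2Analyzer, GSLuceneUtil) and the lucene4 jars are on the classpath.

import java.io.File;
import org.greenstone.LuceneWrapper4.GS2LuceneIndexer.Indexer;

public class IndexerUsageSketch
{
    public static void main(String[] args) throws Exception
    {
        // Open (or create) the index directory, treating <Doc> as the document tag level.
        Indexer indexer = new Indexer("Doc", new File("building/idx"), true /* like -removeold */);

        // Illustrative input only; the real XML comes from lucenebuildproc.pm.
        // The SAX handler reads gs2:docOID and gs2:mode from the document tag,
        // and index="1" / tokenize="0|1" from each field element.
        String doc_xml =
            "<Doc gs2:docOID=\"HASH0123example\" gs2:mode=\"add\">"
            + "<TI index=\"1\" tokenize=\"1\">An example title</TI>"
            + "</Doc>";
        indexer.index(doc_xml);

        // Close the underlying IndexWriter.
        indexer.finish();
    }
}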
/**********************************************************************
 *
 * GS2LuceneIndexer.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

package org.greenstone.LuceneWrapper4;


import java.io.*;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.Analyzer;


import org.apache.lucene.util.Version;

import java.util.Stack;
import java.io.FileInputStream;
import java.io.File;
import java.io.StringReader;
import java.net.URL;


/**
 * class for indexing XML generated by lucenebuildproc.pm
 */

public class GS2LuceneIndexer {

    protected static boolean debug = false;

    protected static void debug(String message)
    {
        if (debug) {
            System.err.println(message);
        }
    }


    public static void main (String args[]) throws Exception
    {
        int verbosity = 1;
        // Default is to edit the existing index
        boolean create_new_index = false;

        Vector filtered_args = new Vector();

        int argc = args.length;
        int i = 0;
        while (i<argc) {
            if (args[i].startsWith("-")) {

                // -removeold causes the existing index to be overwritten
                if (args[i].equals("-removeold")) {
                    create_new_index = true;
                }

                // -verbosity [num]
                else if (args[i].equals("-verbosity")) {
                    i++;
                    if (i<argc) {
                        verbosity = Integer.parseInt(args[i]);
                        if (verbosity>=5) {
                            debug = true;
                        }
                    }
                }
                else if (args[i].equals("-debug")) {
                    debug = true;
                }
                else {
                    System.err.println("Unrecognised option: " + args[i]);
                }
            }
            else {
                filtered_args.add((Object)args[i]);
            }
            i++;
        }

        if (filtered_args.size() != 3) {
            System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
            return;
        }

        String doc_tag_level = (String)filtered_args.get(0);
        String building_dirname = (String)filtered_args.get(1);
        String index_dirname = (String)filtered_args.get(2);

        String import_dirname = building_dirname + File.separator + "text";

        File import_dir = new File(import_dirname);
        File building_dir = new File(building_dirname);

        if (!import_dir.exists()) {
            System.err.println("Couldn't find import directory: "+import_dirname);
            return;
        }

        File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
        idx_dir.mkdir();

        // Set up indexer
        Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);

        // Read from stdin the files to process
        try {
            InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
            BufferedReader brin = new BufferedReader(isr);

            StringBuffer xml_text = new StringBuffer(1024);
            String line = null;
            while ((line = brin.readLine()) != null) {
                xml_text.append(line);
                xml_text.append(" ");

                debug("Got line " + line);

                if (line.endsWith("</Delete>")) {

                    indexer.delete(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
                else if (line.startsWith("</Doc>")) {
                    indexer.index(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
            }

            brin.close();
            isr.close();

        } catch (IOException e) {
            System.err.println("Error: unable to read from stdin");
            e.printStackTrace();
        }

        indexer.finish();
    }

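    // Summary of the Indexer below: a SAX handler that, for each element matching the
    // doc-tag-level, starts a new Lucene Document, stores its gs2:docOID, adds one field
    // per element marked index="1" (analyzed unless tokenize="0"), and writes it with
    // IndexWriter.updateDocument() keyed on the docOID term. In "delete" mode it instead
    // removes documents matching that docOID term.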

    static public class Indexer extends DefaultHandler
    {
        IndexWriter writer_ = null;
        Analyzer analyzer_ = null;
        SAXParser sax_parser_ = null;
        String doc_tag_level_ = null;

        Stack stack_ = null;
        String path_ = "";

        Document current_doc_ = null;
        String current_node_ = "";
        String current_doc_oid_ = "";
        String indexable_current_node_ = "";
        boolean tokenize = true;
        String current_contents_ = "";

        String mode_ = "";
        protected String file_id_ = null;

        /** Pass in true to create a new index, false to add to the existing one. */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                analyzer_ = new GS2Analyzer(); // uses the built-in stop_word_set
                writer_ = GSLuceneUtil.getIndexWriter(index_dir.getPath(), analyzer_, create);
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }

        /** Index one document read from a file. */
        public void index (String file_id, File file)
        {
            mode_ = "add";
            file_id_ = file_id;
            path_ = "";
            String base_path = file.getPath();
            base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));

            try {
                sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Index one document stored as a string. */
        public void index (String xml_text)
        {
            mode_ = "add";
            file_id_ = "<xml doc on stdin>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Delete one document, based on the doc_id in <Delete>doc_id</Delete>. */
        public void delete(String xml_text)
        {
            mode_ = "delete";
            file_id_ = "<delete doc>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Close the index writer. (optimize() is deprecated in Lucene 4, so the index is no longer explicitly optimised here.) */
        public void finish()
        {
            try {
                // writer_.optimize(); // now deprecated
                writer_.close();
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }

        protected void print(String s)
        {
            System.err.print(s);
        }

        protected void println(String s)
        {
            System.err.println(s);
        }

        public void startDocument() throws SAXException
        {
            println("Starting to process " + file_id_);
            print("[");
        }

        public void endDocument() throws SAXException
        {
            println("]");
            println("... processing finished.");
        }

        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start new doc
                current_node_ = qName;

                //String node_id = atts.getValue("gs2:id");
                //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
                //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
                if (isTokenized(atts)) {
                    tokenize = true;
                } else {
                    tokenize = false;
                }
            }
            else {
                indexable_current_node_ = "";
            }
        }

        public static boolean isTokenized(Attributes atts) {
            boolean tokenize = true;
            String tok = atts.getValue("tokenize");
            if (tok!=null && tok.equals("0")) {
                tokenize = false;
            }
            return tokenize;
        }

        public static boolean isIndexable(Attributes atts)
        {
            boolean is_indexable = false;

            String index = atts.getValue("index");
            if (index!=null) {
                if (index.equals("1")) {
                    is_indexable = true;
                }
            }
            return is_indexable;
        }

        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_))
                {
                    if (tokenize) {
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
                    } else {
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    }
                    // // The byXX fields are used for sorting search results
                    // // We don't want to do that for Text or AllFields fields
                    // // They need to be untokenised for sorting
                    // if (!qName.equals("TX") && !qName.equals("ZZ"))
                    // {
                    //     current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    // }

                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // It might be more efficient to use addDocument() for "add"
                        // and updateDocument() only for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end document
                }

                path_ = removePathLink(path_);
            }
        }

        public void characters(char ch[], int start, int length) throws SAXException
        {
            String data = new String(ch, start, length).trim();
            if (data.length() > 0 ) {
                current_contents_ += data;
            }
        }

        protected String appendPathLink(String path, String qName, Attributes atts)
        {

            path = path + "/"+qName;
            if (atts.getLength()>0) {
                // was gs2:id, changed to gs2:docOID --kjdon
                String id = atts.getValue("gs2:docOID");
                if (id != null) {
                    path += "[@gs2:docOID='"+id+"']";
                }
                else {
                    // is this ever used? not in perl currently
                    id = atts.getValue("gs3:id");
                    if (id != null) {
                        path += "[@gs3:id='"+id+"']";
                    }
                }
            }
            return path;
        }

        protected String removePathLink(String path)
        {

            int i=path.lastIndexOf('/');
            if (i==-1) {
                path="";
            } else {
                path = path.substring(0, i);
            }
            return path;
        }


        /** These are what we save on the stack. */
        private class MyDocument
        {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";

        }


        protected void pushOnStack()
        {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        protected void popOffStack()
        {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument)stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            } else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }


        protected void deleteDocument(String doc_id)
            throws IOException
        {
            debug("GS2LuceneIndexer.Indexer.deleteDocument(" + doc_id + ")");
            debug("- Initial number of documents in index: " + writer_.numDocs());
            writer_.deleteDocuments(new Term("docOID", doc_id));
            debug("- Final number of documents in index: " + writer_.numDocs());
        }


    }
}
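The Indexer constructor above delegates opening the index to GSLuceneUtil.getIndexWriter(index_dir.getPath(), analyzer_, create), which lives elsewhere in the LuceneWrapper4 sources and is not shown here. As a rough sketch of what such a call amounts to against the stock Lucene 4.7 API (the class IndexWriterOpenSketch and method openIndexWriter below are illustrative, not the actual Greenstone helper):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

class IndexWriterOpenSketch
{
    // Illustrative stand-in for GSLuceneUtil.getIndexWriter(path, analyzer, create).
    static IndexWriter openIndexWriter(String index_path, Analyzer analyzer, boolean create)
        throws IOException
    {
        Directory dir = FSDirectory.open(new File(index_path));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        // -removeold maps to CREATE (overwrite any existing index);
        // otherwise open for appending, creating the index if it does not exist yet.
        config.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE
                                  : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(dir, config);
    }
}

With a writer opened this way, the updateDocument(), deleteDocuments() and close() calls in the listing behave as in stock Lucene 4.7.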