source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/lucene-version-3.3/GS2LuceneIndexer.java@ 24716

Last change on this file since 24716 was 24716, checked in by davidb, 13 years ago

A version of the LuceneWrapper code that works with Lucene version 3.3

  • Property svn:executable set to *
File size: 12.1 KB
/**********************************************************************
 *
 * GS2LuceneIndexer.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

package org.greenstone.LuceneWrapper;


import java.io.*;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;

import java.util.Stack;
import java.io.FileInputStream;
import java.io.File;
import java.io.StringReader;
import java.net.URL;


/**
 * class for indexing XML generated by lucenebuildproc.pm
 */

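// Reads blocks of XML from stdin (as produced by lucenebuildproc.pm): each
// <Doc> block is indexed, one Lucene document per doc-tag-level element keyed
// on its gs2:docOID attribute, and each <Delete> block removes the named
// document from the index.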
public class GS2LuceneIndexer {

    protected static boolean debug = false;

    protected static void debug(String message)
    {
        if (debug) {
            System.err.println(message);
        }
    }

    public static void main (String args[]) throws Exception
    {
        int verbosity = 1;
        // Default is to edit the existing index
        boolean create_new_index = false;

        Vector filtered_args = new Vector();

        int argc = args.length;
        int i = 0;
        while (i<argc) {
            if (args[i].startsWith("-")) {

                // -removeold causes the existing index to be overwritten
                if (args[i].equals("-removeold")) {
                    create_new_index = true;
                }

                // -verbosity [num]
                else if (args[i].equals("-verbosity")) {
                    i++;
                    if (i<argc) {
                        verbosity = Integer.parseInt(args[i]);
                        if (verbosity>=5) {
                            debug = true;
                        }
                    }
                }
                else if (args[i].equals("-debug")) {
                    debug = true;
                }
                else {
                    System.err.println("Unrecognised option: " + args[i]);
                }
            }
            else {
                filtered_args.add((Object)args[i]);
            }
            i++;
        }

        if (filtered_args.size() != 3) {
            System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
            return;
        }
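
        // Example invocation (hypothetical paths, for illustration only; classpath omitted):
        //   java org.greenstone.LuceneWrapper.GS2LuceneIndexer -verbosity 2 Doc /path/to/collect/building idx
        // i.e. the doc-tag-level ("Doc"), the building directory (which must contain a
        // "text" subdirectory), and the name of the index directory to create beneath
        // it. The XML documents themselves are read from stdin, below.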

        String doc_tag_level = (String)filtered_args.get(0);
        String building_dirname = (String)filtered_args.get(1);
        String index_dirname = (String)filtered_args.get(2);

        String import_dirname = building_dirname + File.separator + "text";

        File import_dir = new File(import_dirname);
        File building_dir = new File(building_dirname);

        if (!import_dir.exists()) {
            System.err.println("Couldn't find import directory: "+import_dirname);
            return;
        }

        File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
        idx_dir.mkdir();

        // Set up indexer
        Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);

        // Read from stdin the files to process
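        // Lines are accumulated into a buffer until one ending with "</Delete>" (delete
        // the named document) or starting with "</Doc>" (index the accumulated document)
        // is seen; the buffer is then passed to the indexer and reset.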
        try {
            InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
            BufferedReader brin = new BufferedReader(isr);

            StringBuffer xml_text = new StringBuffer(1024);
            String line = null;
            while ((line = brin.readLine()) != null) {
                xml_text.append(line);
                xml_text.append(" ");

                debug("Got line " + line);

                if (line.endsWith("</Delete>")) {

                    indexer.delete(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
                else if (line.startsWith("</Doc>")) {
                    indexer.index(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
            }

            brin.close();
            isr.close();

        } catch (IOException e) {
            System.err.println("Error: unable to read from stdin");
            e.printStackTrace();
        }

        indexer.finish();
    }


    static public class Indexer extends DefaultHandler
    {
        IndexWriter writer_ = null;
        Analyzer analyzer_ = null;
        SAXParser sax_parser_ = null;
        String doc_tag_level_ = null;

        Stack stack_ = null;
        String path_ = "";

        Document current_doc_ = null;
        String current_node_ = "";
        String current_doc_oid_ = "";
        String indexable_current_node_ = "";
        String current_contents_ = "";

        String mode_ = "";
        protected String file_id_ = null;

        /** Pass in true to create a new index, false to use the existing one */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                SimpleFSDirectory index_dir_dir = new SimpleFSDirectory(new File(index_dir.getPath()));

                analyzer_ = new GS2Analyzer(); // uses built-in stop word set

                writer_ = new IndexWriter(index_dir_dir, analyzer_, create, MaxFieldLength.UNLIMITED);

                // by default, will only index 10,000 words per document
                // Can throw out-of-memory errors
                writer_.setMaxFieldLength(Integer.MAX_VALUE);
                if (create) {
                    writer_.optimize();
                }
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }

        /** index one document read from a file */
        public void index (String file_id, File file)
        {
            mode_ = "add";
            file_id_ = file_id;
            path_ = "";
            String base_path = file.getPath();
            base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));

            try {
                sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** index one document stored as a string */
        public void index (String xml_text)
        {
            mode_ = "add";
            file_id_ = "<xml doc on stdin>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** delete one document, based on the doc_id given in <Delete>doc_id</Delete> */
        public void delete(String xml_text)
        {
            mode_ = "delete";
            file_id_ = "<delete doc>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** optimise the index and close the writer */
        public void finish()
        {
            try {
                writer_.optimize();
                writer_.close();
            }
            catch (Exception e) {
                // report rather than silently swallow failures while closing the index
                e.printStackTrace();
            }
        }

        protected void print(String s)
        {
            System.err.print(s);
        }

        protected void println(String s)
        {
            System.err.println(s);
        }

        public void startDocument() throws SAXException
        {
            println("Starting to process " + file_id_);
            print("[");
        }

        public void endDocument() throws SAXException
        {
            println("]");
            println("... processing finished.");
        }

        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start new doc
                current_node_ = qName;

                //String node_id = atts.getValue("gs2:id");
                //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
                //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
            }
            else {
                indexable_current_node_ = "";
            }
        }

        public static boolean isIndexable(Attributes atts)
        {
            boolean is_indexable = false;

            String index = atts.getValue("index");
            if (index!=null) {
                if (index.equals("1")) {
                    is_indexable = true;
                }
            }
            return is_indexable;
        }
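
        // An element's content is indexed only when it carries index="1", e.g. a field
        // such as <TX index="1">...</TX> in the generated XML ("TX" being the Text
        // field referred to elsewhere in this class).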

        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_))
                {
                    current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
                    // The byXX fields are used for sorting search results.
                    // We don't want to do that for the Text (TX) or AllFields (ZZ)
                    // fields; sort fields need to be untokenised.
                    if (!qName.equals("TX") && !qName.equals("ZZ"))
                    {
                        current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    }

                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // updateDocument() deletes any existing document with this docOID
                        // term and then adds the new one; perhaps this would be more
                        // efficient if addDocument() were used for "add" and
                        // updateDocument() only for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end document
                }

                path_ = removePathLink(path_);
            }
        }

        public void characters(char ch[], int start, int length) throws SAXException
        {
            String data = new String(ch, start, length).trim();
            if (data.length() > 0 ) {
                current_contents_ += data;
            }
        }

        protected String appendPathLink(String path, String qName, Attributes atts)
        {

            path = path + "/"+qName;
            if (atts.getLength()>0) {
                // was gs2:id, changed to gs2:docOID --kjdon
                String id = atts.getValue("gs2:docOID");
                if (id != null) {
                    path += "[@gs2:docOID='"+id+"']";
                }
                else {
                    // is this ever used? not in perl currently
                    id = atts.getValue("gs3:id");
                    if (id != null) {
                        path += "[@gs3:id='"+id+"']";
                    }
                }
            }
            return path;
        }

        protected String removePathLink(String path)
        {

            int i=path.lastIndexOf('/');
            if (i==-1) {
                path="";
            } else {
                path = path.substring(0, i);
            }
            return path;
        }


        /** these are what we save on the stack */
        private class MyDocument
        {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";

        }


        protected void pushOnStack()
        {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        protected void popOffStack()
        {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument)stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            } else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }


        protected void deleteDocument(String doc_id)
            throws IOException
        {
            debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
            debug("- Initial number of documents in index: " + writer_.numDocs());
            writer_.deleteDocuments(new Term("docOID", doc_id));
            debug("- Final number of documents in index: " + writer_.numDocs());
        }


    }
}