source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 24731

Last change on this file since 24731 was 24731, checked in by sjm84, 13 years ago

Lucene 3.x version of code accidentally committed; rolling back to the 2.x-compatible version

  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 protected static boolean debug = false;
62
63 protected static void debug(String message)
64 {
65 if (debug) {
66 System.err.println(message);
67 }
68 }
69
70
71 public static void main (String args[]) throws Exception
72 {
73 int verbosity = 1;
74 // Default is to edit the existing index
75 boolean create_new_index = false;
76
77 Vector filtered_args = new Vector();
78
79 int argc = args.length;
80 int i = 0;
81 while (i<argc) {
82 if (args[i].startsWith("-")) {
83
84 // -removeold causes the existing index to be overwritten
85 if (args[i].equals("-removeold")) {
86 create_new_index = true;
87 }
88
89 // -verbosity [num]
90 else if (args[i].equals("-verbosity")) {
91 i++;
92 if (i<argc) {
93 verbosity = Integer.parseInt(args[i]);
94 if (verbosity>=5) {
95 debug = true;
96 }
97 }
98 }
99 else if (args[i].equals("-debug")) {
100 debug = true;
101 }
102 else {
103 System.err.println("Unrecognised option: " + args[i]);
104 }
105 }
106 else {
107 filtered_args.add((Object)args[i]);
108 }
109 i++;
110 }
111
112 if (filtered_args.size() != 3) {
113 System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
114 return;
115 }
116
117 String doc_tag_level = (String)filtered_args.get(0);
118 String building_dirname = (String)filtered_args.get(1);
119 String index_dirname = (String)filtered_args.get(2);
120
121 String import_dirname = building_dirname + File.separator + "text";
122
123 File import_dir = new File(import_dirname);
124 File building_dir = new File(building_dirname);
125
126 if (!import_dir.exists()) {
127 System.err.println("Couldn't find import directory: "+import_dirname);
128 return;
129 }
130
131 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132 idx_dir.mkdir();
133
134 // Set up indexer
135 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137 // Read from stdin the files to process
138 try {
139 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140 BufferedReader brin = new BufferedReader(isr);
141
142 StringBuffer xml_text = new StringBuffer(1024);
143 String line = null;
144 while ((line = brin.readLine()) != null) {
145 xml_text.append(line);
146 xml_text.append(" ");
147
148 debug("Got line " + line);
149
150 if (line.endsWith("</Delete>")) {
151
152 indexer.delete(xml_text.toString());
153 xml_text = new StringBuffer(1024);
154 }
155 else if (line.startsWith("</Doc>")) {
156 indexer.index(xml_text.toString());
157 xml_text = new StringBuffer(1024);
158 }
159 }
160
161 brin.close();
162 isr.close();
163
164 } catch (IOException e) {
165 System.err.println("Error: unable to read from stdin");
166 e.printStackTrace();
167 }
168
169 indexer.finish();
170 }
171
172
173 static public class Indexer extends DefaultHandler
174 {
175 IndexWriter writer_ = null;
176 Analyzer analyzer_ = null;
177 SAXParser sax_parser_ = null;
178 String doc_tag_level_ = null;
179
180 Stack stack_ = null;
181 String path_ = "";
182
183 Document current_doc_ = null;
184 String current_node_ = "";
185 String current_doc_oid_ = "";
186 String indexable_current_node_ = "";
187 String current_contents_ = "";
188
189 String mode_ = "";
190 protected String file_id_ = null;
191
192 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
193
194
195 /** pass in true if want to create a new index, false if want to use the existing one */
196 public Indexer (String doc_tag_level, File index_dir, boolean create)
197 {
198 doc_tag_level_ = doc_tag_level;
199
200 try {
201 stack_ = new Stack();
202 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203 sax_parser_ = sax_factory.newSAXParser();
204
205 XMLReader reader = sax_parser_.getXMLReader();
206 reader.setFeature("http://xml.org/sax/features/validation", false);
207
208 analyzer_ = new GS2Analyzer(stop_words);
209
210 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
211 // by default, will only index 10,000 words per document
212 // Can throw out_of_memory errors
213 writer_.setMaxFieldLength(Integer.MAX_VALUE);
214 if (create) {
215 writer_.optimize();
216 }
217 }
218 catch (Exception e) {
219 // We need to know if creating/opening the index fails
220 e.printStackTrace();
221 }
222 }
223
224 /** index one document */
225 public void index (String file_id, File file)
226 {
227 mode_ = "add";
228 file_id_ = file_id;
229 path_ = "";
230 String base_path = file.getPath();
231 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
232
233 try {
234 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
235 }
236 catch (Exception e) {
237 println("parse error:");
238 e.printStackTrace();
239 }
240 }
241
242 /** index one document stored as string*/
243 public void index (String xml_text)
244 {
245 mode_ = "add";
246 file_id_ = "<xml doc on stdin>";
247 path_ = "";
248
249 try {
250 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
251 }
252 catch (Exception e) {
253 println("parse error:");
254 e.printStackTrace();
255 }
256 }
257
258 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
259 public void delete(String xml_text)
260 {
261 mode_ = "delete";
262 file_id_ = "<delete doc>";
263 path_ = "";
264
265 try {
266 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
267 }
268 catch (Exception e) {
269 println("parse error:");
270 e.printStackTrace();
271 }
272 }
273
274 public void finish()
275 {
276 /** optimise the index */
277 try {
278 writer_.optimize();
279 writer_.close();
280 }
281 catch (Exception e) {
282 }
283 }
284
285 protected void print(String s)
286 {
287 System.err.print(s);
288 }
289
290 protected void println(String s)
291 {
292 System.err.println(s);
293 }
294
295 public void startDocument() throws SAXException
296 {
297 println("Starting to process " + file_id_);
298 print("[");
299 }
300
301 public void endDocument() throws SAXException
302 {
303 println("]");
304 println("... processing finished.");
305 }
306
307 public void startElement(String uri, String localName, String qName, Attributes atts)
308 throws SAXException
309 {
310 path_ = appendPathLink(path_, qName, atts);
311
312 if (qName.equals(doc_tag_level_)) {
313 mode_ = atts.getValue("gs2:mode");
314
315 pushOnStack(); // start new doc
316 current_node_ = qName;
317
318 //String node_id = atts.getValue("gs2:id");
319 //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
320 //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
321
322 current_doc_oid_ = atts.getValue("gs2:docOID");
323 print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
324 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
325 }
326
327 if (isIndexable(atts)) {
328 indexable_current_node_ = qName;
329 }
330 else {
331 indexable_current_node_ = "";
332 }
333 }
334
335 public static boolean isIndexable(Attributes atts)
336 {
337 boolean is_indexable = false;
338
339 String index = atts.getValue("index");
340 if (index!=null) {
341 if (index.equals("1")) {
342 is_indexable = true;
343 }
344 }
345 return is_indexable;
346 }
347
348 public void endElement(String uri, String localName, String qName) throws SAXException
349 {
350 if (mode_.equals("delete")) {
351 try {
352 deleteDocument(current_doc_oid_);
353 }
354 catch (java.io.IOException e) {
355 e.printStackTrace();
356 }
357 }
358 else if (mode_.equals("add") || mode_.equals("update")) {
359 if (qName.equals(indexable_current_node_))
360 {
361 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
362 // The byXX fields are used for sorting search results
363 // We don't want to do that for Text or AllFields fields
364 // They need to be untokenised for sorting
365 if (!qName.equals("TX") && !qName.equals("ZZ"))
366 {
367 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
368 }
369
370 current_contents_ = "";
371 }
372
373 if (qName.equals(doc_tag_level_)) {
374 try {
375 // perhaps this is more efficient if addDocument()
376 // used for "add" and updateDocument() for "update"
377 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
378 }
379 catch (java.io.IOException e) {
380 e.printStackTrace();
381 }
382 popOffStack(); // end document
383 }
384
385 path_ = removePathLink(path_);
386 }
387 }
388
389 public void characters(char ch[], int start, int length) throws SAXException
390 {
391 String data = new String(ch, start, length).trim();
392 if (data.length() > 0 ) {
393 current_contents_ += data;
394 }
395 }
396
397 protected String appendPathLink(String path, String qName, Attributes atts)
398 {
399
400 path = path + "/"+qName;
401 if (atts.getLength()>0) {
402 // was gs2:id, changed to gs2:docOID --kjdon
403 String id = atts.getValue("gs2:docOID");
404 if (id != null) {
405 path += "[@gs2:docOID='"+id+"']";
406 }
407 else {
408 // is this ever used? not in perl currently
409 id = atts.getValue("gs3:id");
410 if (id != null) {
411 path += "[@gs3:id='"+id+"']";
412 }
413 }
414 }
415 return path;
416 }
417
418 protected String removePathLink(String path)
419 {
420
421 int i=path.lastIndexOf('/');
422 if (i==-1) {
423 path="";
424 } else {
425 path = path.substring(0, i);
426 }
427 return path;
428 }
429
430
431 /** these are what we save on the stack */
432 private class MyDocument
433 {
434 public Document doc = null;
435 public String contents = null;
436 public String tagname = "";
437
438 }
439
440
441 protected void pushOnStack()
442 {
443 if (current_doc_ != null) {
444 MyDocument save = new MyDocument();
445 save.doc = current_doc_;
446 save.contents = current_contents_;
447 save.tagname = current_node_;
448 stack_.push(save);
449 }
450 current_doc_ = new Document();
451 current_contents_ = "";
452 current_node_ = "";
453 }
454
455 protected void popOffStack()
456 {
457 if (!stack_.empty()) {
458 MyDocument saved = (MyDocument)stack_.pop();
459 current_doc_ = saved.doc;
460 current_contents_ = saved.contents;
461 current_node_ = saved.tagname;
462 } else {
463 current_doc_ = new Document();
464 current_contents_ = "";
465 current_node_ = "";
466 }
467 }
468
469
470 protected void deleteDocument(String doc_id)
471 throws IOException
472 {
473 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
474 debug("- Initial number of documents in index: " + writer_.docCount());
475 writer_.deleteDocuments(new Term("docOID", doc_id));
476 debug("- Final number of documents in index: " + writer_.docCount());
477 }
478
479
480 }
481}
Note: See TracBrowser for help on using the repository browser.