/********************************************************************** * * GS2LuceneIndexer.java * * Copyright 2004 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *********************************************************************/ package org.greenstone.LuceneWrapper; import java.io.*; import java.util.Vector; import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.analysis.standard.StandardAnalyzer; import java.util.Stack; import java.io.FileInputStream; import java.io.File; import java.io.StringReader; import java.net.URL; /** * class for indexing XML generated by lucenebuildproc.pm */ public class GS2LuceneIndexer { public static void main (String args[]) throws Exception { int verbosity = 1; // Default is to edit the existing index boolean create_new_index = false; 
Vector filtered_args = new Vector(); int argc = args.length; int i = 0; while (i")) { indexer.index(xml_text.toString()); xml_text = new StringBuffer(1024); } } brin.close(); isr.close(); } catch (IOException e) { System.err.println("Error: unable to read from stdin"); e.printStackTrace(); } indexer.finish(); } static public class Indexer extends DefaultHandler { IndexWriter writer_ = null; SAXParser sax_parser_ = null; String doc_tag_level_ = null; Stack stack_ = null; String path_ = ""; Document current_doc_ = null; String current_node_ = ""; String current_doc_oid_ = ""; String indexable_current_node_ = ""; String current_contents_ = ""; protected String file_id_ = null; /** pass in true if want to create a new index, false if want to use the existing one */ public Indexer (String doc_tag_level, File index_dir, boolean create) { doc_tag_level_ = doc_tag_level; try { stack_ = new Stack(); SAXParserFactory sax_factory = SAXParserFactory.newInstance(); sax_parser_ = sax_factory.newSAXParser(); XMLReader reader = sax_parser_.getXMLReader(); reader.setFeature("http://xml.org/sax/features/validation", false); writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create); // by default, will only index 10,000 words per document // Can throw out_of_memory errors writer_.setMaxFieldLength(Integer.MAX_VALUE); if (create) { writer_.optimize(); } } catch (Exception e) { // do nothing! 
} } // closes the catch block and the Indexer constructor that begin on the previous line

        /**
         * Indexes one document read from a file.
         *
         * Records file_id for the progress messages, resets the element path
         * and runs the SAX parser over the file with this object as handler.
         * Any failure is reported on the console and the method returns
         * normally; nothing is thrown to the caller.
         *
         * @param file_id label printed in the "Starting to index" message
         * @param file    the XML file to parse
         */
        public void index (String file_id, File file)
        {
            file_id_ = file_id;
            path_ = "";

            // The stream is opened inside the try so that a missing or
            // unreadable file is reported the same way as a parse failure,
            // and it is always closed again afterwards.
            FileInputStream input = null;
            try {
                input = new FileInputStream(file);
                sax_parser_.parse(new InputSource(input), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
            finally {
                if (input != null) {
                    try {
                        input.close();
                    }
                    catch (IOException ignored) {
                        // nothing useful can be done if closing a stream
                        // that was only read from fails
                    }
                }
            }
        }

        /**
         * Indexes one document held in memory as a string.
         *
         * The file id used in progress messages is set to the empty string.
         * Any failure is reported on the console and the method returns
         * normally.
         *
         * @param xml_text the complete XML text of the document
         */
        public void index (String xml_text)
        {
            file_id_ = "";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /**
         * Optimises the index and closes the index writer.
         *
         * The constructor swallows its own exceptions, so writer_ may still
         * be null here; that case is reported instead of failing silently.
         * close() is attempted even when optimize() fails so that the writer
         * is not left open.
         */
        public void finish()
        {
            if (writer_ == null) {
                System.err.println("Error: index writer was never opened, nothing to finish");
                return;
            }

            try {
                writer_.optimize();
            }
            catch (Exception e) {
                System.err.println("Error: unable to optimise the index");
                e.printStackTrace();
            }
            finally {
                try {
                    writer_.close();
                }
                catch (Exception e) {
                    System.err.println("Error: unable to close the index");
                    e.printStackTrace();
                }
            }
        }

        /** Writes s to standard output without a line terminator. */
        protected void print(String s)
        {
            System.out.print(s);
        }

        /** Writes s to standard output followed by a line terminator. */
        protected void println(String s)
        {
            System.out.println(s);
        }

        /** SAX callback: announces the document and opens the progress bracket. */
        public void startDocument() throws SAXException
        {
            println("Starting to index " + file_id_);
            print("[");
        }

        /** SAX callback: closes the progress bracket and reports completion. */
        public void endDocument() throws SAXException
        {
            println("]");
            println("... indexing finished.");
        }

        /**
         * SAX callback for an opening tag.
         *
         * Extends the element path. When the tag is the document-level tag
         * (doc_tag_level_), the document in progress is pushed on the stack,
         * a fresh Lucene document is started and its "nodeID" and "docOID"
         * fields are filled from the gs2:id and gs2:docOID attributes.
         * Finally the tag is remembered as the current indexable node if
         * XMLTagInfo.isIndexable(atts) says so, and forgotten otherwise.
         *
         * @throws SAXException if a document-level tag lacks gs2:id or
         *                      gs2:docOID (a Lucene Field cannot hold null)
         */
        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                pushOnStack(); // start new doc
                current_node_ = qName;

                String node_id = atts.getValue("gs2:id");
                print(" " + qName + ": " + node_id );
                if (node_id == null) {
                    throw new SAXException("<" + qName + "> element has no gs2:id attribute");
                }
                current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                if (current_doc_oid_ == null) {
                    throw new SAXException("<" + qName + "> element has no gs2:docOID attribute");
                }
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            // Every opening tag overwrites this, so a child element inside an
            // indexable element replaces (or clears) the remembered name.
            if (XMLTagInfo.isIndexable(atts)) {
                indexable_current_node_ = qName;
            }
            else {
                indexable_current_node_ = "";
            }
        }

        /**
         * SAX callback for a closing tag.
         *
         * If the tag is the current indexable node, the text collected so far
         * is added to the document as a tokenized field named after the tag
         * (with term vector), and the collected text is cleared. If the tag
         * is the document-level tag, the finished document is written to the
         * index and the previously saved document (if any) is restored.
         * The element path is shortened in every case.
         */
        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (qName.equals(indexable_current_node_)) {
                current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));

                // Every tag except TX also gets an untokenized "by<tag>" copy
                // of the same text, stored without a term vector
                // (presumably used for sorting - confirm against the searcher).
                if (!qName.equals("TX")) {
                    current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
                }

                current_contents_ = "";
            }

            if (qName.equals(doc_tag_level_)) {
                try {
                    // NOTE(review): the replacement key is docOID, not nodeID.
                    // If several document-level elements can share a docOID,
                    // each update would replace the earlier ones - confirm.
                    writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
                popOffStack(); // end document
            }

            path_ = removePathLink(path_);
        }

        /**
         * SAX callback for character data: trims the chunk and, if anything
         * is left, appends it to the text collected for the current node.
         *
         * NOTE(review): each chunk is trimmed on its own before being
         * appended, so if the parser delivers one run of text in several
         * chunks the whitespace at the chunk boundaries is lost. Left
         * unchanged here because altering it would change what is indexed.
         */
        public void characters(char ch[], int start, int length) throws SAXException
        {
            String data = new String(ch, start, length).trim();
            if (data.length() > 0) {
                current_contents_ += data;
            }
        }

        /**
         * Returns path extended by "/" + qName, followed by an
         * [@gs2:id='...'] predicate when the element has a gs2:id attribute,
         * or else an [@gs3:id='...'] predicate when it has a gs3:id attribute.
         */
        protected String appendPathLink(String path, String qName, Attributes atts)
        {
            String extended = path + "/" + qName;
            if (atts.getLength() == 0) {
                return extended;
            }

            String gs2_id = atts.getValue("gs2:id");
            if (gs2_id != null) {
                return extended + "[@gs2:id='" + gs2_id + "']";
            }

            String gs3_id = atts.getValue("gs3:id");
            if (gs3_id != null) {
                return extended + "[@gs3:id='" + gs3_id + "']";
            }
            return extended;
        }

        /**
         * Returns path with everything from its last '/' onwards removed,
         * or the empty string when path contains no '/'.
         */
        protected String removePathLink(String path)
        {
            int last_slash = path.lastIndexOf('/');
            if (last_slash == -1) {
                return "";
            }
            return path.substring(0, last_slash);
        }

        /** these are what we save on the stack */
        private static class MyDocument
        {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";
        }

        /**
         * Saves the document in progress (if there is one) together with its
         * collected text and tag name, then starts a fresh, empty document.
         */
        protected void pushOnStack()
        {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        /**
         * Restores the most recently saved document, text and tag name.
         * When nothing was saved, resets to a fresh, empty document instead.
         */
        protected void popOffStack()
        {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument)stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            }
            else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }
    }
}