import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import java.util.Stack; import java.io.FileInputStream; import java.io.File; import java.io.StringReader; import java.net.URL; public class Indexer extends DefaultHandler { IndexWriter writer_ = null; SAXParser sax_parser_ = null; String doc_tag_level_ = null; Stack stack_ = null; String path_ = ""; Document current_doc_ = null; String current_node_ = ""; String indexable_current_node_ = ""; String current_contents_ = ""; protected String file_id_ = null; /** pass in true if want to create a new index, false if want to use the existing one */ public Indexer (String doc_tag_level, File index_dir, boolean create) { doc_tag_level_ = doc_tag_level; try { stack_ = new Stack(); SAXParserFactory sax_factory = SAXParserFactory.newInstance(); sax_parser_ = sax_factory.newSAXParser(); XMLReader reader = sax_parser_.getXMLReader(); reader.setFeature("http://xml.org/sax/features/validation", false); writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create); if (create) { writer_.optimize(); } } catch (Exception e) { // do nothing! } } /** index one document */ public void index (String file_id, File file) { file_id_ = file_id; path_ = ""; String base_path = file.getPath(); base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar)); try { sax_parser_.parse(new InputSource(new FileInputStream(file)), this); } catch (Exception e) { println("parse error:"); e.printStackTrace(); } } /** index one document stored as string*/ public void index (String xml_text) { file_id_ = ""; path_ = ""; try { sax_parser_.parse(new InputSource(new StringReader(xml_text)), this); } catch (Exception e) { println("parse error:"); e.printStackTrace(); } } public void finish() { /** optimise the index */ try { writer_.optimize(); writer_.close(); } catch (Exception e) { } } protected void println(String s) { System.out.println(s); } public void startDocument() throws SAXException { println("Starting to index " + file_id_); } public void endDocument() throws SAXException { println("... indexing finished."); } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { path_ = appendPathLink(path_, qName, atts); if (qName.equals(doc_tag_level_)) { pushOnStack(); // start new doc current_node_ = qName; String node_id = atts.getValue("gs2:id"); System.out.println("**** Indexing "+ qName + " " + node_id ); current_doc_.add(Field.UnIndexed("nodeID", node_id)); } if (XMLTagInfo.isIndexable(atts)) { indexable_current_node_ = qName; } else { indexable_current_node_ = ""; } } public void endElement(String uri, String localName, String qName) throws SAXException { if (qName.equals(indexable_current_node_)) { current_doc_.add(Field.UnStored(qName, current_contents_)); current_contents_ = ""; } if (qName.equals(doc_tag_level_)) { try { writer_.addDocument(current_doc_); } catch (java.io.IOException e) { e.printStackTrace(); } popOffStack(); // end document } path_ = removePathLink(path_); } public void characters(char ch[], int start, int length) throws SAXException { String data = new String(ch, start, length).trim(); if (data.length() > 0 ) { current_contents_ += data; } } protected String appendPathLink(String path, String qName, Attributes atts) { path = path + "/"+qName; if (atts.getLength()>0) { String id = atts.getValue("gs2:id"); if (id != null) { path += "[@gs2:id='"+id+"']"; } else { id = atts.getValue("gs3:id"); if (id != null) { path += "[@gs3:id='"+id+"']"; } } } return path; } protected String removePathLink(String path) { int i=path.lastIndexOf('/'); if (i==-1) { path=""; } else { path = path.substring(0, i); } return path; } /** these are what we save on the stack */ private class MyDocument { public Document doc = null; public String contents = null; public String tagname = ""; } protected void pushOnStack() { if (current_doc_ != null) { MyDocument save = new MyDocument(); save.doc = current_doc_; save.contents = current_contents_; save.tagname = current_node_; stack_.push(save); } current_doc_ = new Document(); current_contents_ = ""; current_node_ = ""; } protected void popOffStack() { if (!stack_.empty()) { MyDocument saved = (MyDocument)stack_.pop(); current_doc_ = saved.doc; current_contents_ = saved.contents; current_node_ = saved.tagname; } else { current_doc_ = new Document(); current_contents_ = ""; current_node_ = ""; } } }