/**********************************************************************
 *
 * GS2LuceneIndexer.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

package org.greenstone.LuceneWrapper;

import java.io.*;
import java.util.Stack;
import java.util.Vector;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

/**
 * Class for indexing the XML generated by lucenebuildproc.pm
 */
public class GS2LuceneIndexer {

    protected static boolean debug = false;

    protected static void debug(String message) {
        if (debug) {
            System.err.println(message);
        }
    }

    public static void main(String args[]) throws Exception {
        int verbosity = 1;
        // Default is to edit the existing index
        boolean create_new_index = false;

        Vector filtered_args = new Vector();

        int argc = args.length;
        int i = 0;
        while (i < argc) {
            if (args[i].startsWith("-")) {
                // -removeold causes the existing index to be overwritten
                if (args[i].equals("-removeold")) {
                    create_new_index = true;
                }
                // -verbosity [num]: a verbosity of 5 or more turns on debugging
                else if (args[i].equals("-verbosity")) {
                    i++;
                    if (i < argc) {
                        verbosity = Integer.parseInt(args[i]);
                        if (verbosity >= 5) {
                            debug = true;
                        }
                    }
                }
                else if (args[i].equals("-debug")) {
                    debug = true;
                }
                else {
                    System.out.println("Unrecognised option: " + args[i]);
                }
            }
            else {
                filtered_args.add((Object) args[i]);
            }
            i++;
        }

        if (filtered_args.size() != 3) {
            System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
            return;
        }

        String doc_tag_level    = (String) filtered_args.get(0);
        String building_dirname = (String) filtered_args.get(1);
        String index_dirname    = (String) filtered_args.get(2);

        String import_dirname = building_dirname + File.separator + "text";

        File import_dir   = new File(import_dirname);
        File building_dir = new File(building_dirname);

        if (!import_dir.exists()) {
            System.out.println("Couldn't find import directory: " + import_dirname);
            return;
        }

        File idx_dir = new File(building_dir.getPath() + File.separator + index_dirname + File.separator);
        idx_dir.mkdir();

        // Set up the indexer
        Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
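        // The documents to process arrive on stdin as XML fragments, one
        // <Doc> block per document, with deletion requests ending in
        // </Delete>.  The sketch below is illustrative only, inferred from
        // the SAX handler in the Indexer class; the authoritative format is
        // whatever lucenebuildproc.pm emits, and element names such as TI
        // are hypothetical examples:
        //
        //   <Doc gs2:docOID="HASH0123" gs2:id="1" gs2:mode="add">
        //     <TI index="1">Some title</TI>
        //     <TX index="1">The body text of this section ...</TX>
        //   </Doc>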
        // Read from stdin the files to process
        try {
            InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
            BufferedReader brin = new BufferedReader(isr);

            StringBuffer xml_text = new StringBuffer(1024);
            String line = null;
            while ((line = brin.readLine()) != null) {
                xml_text.append(line);
                xml_text.append(" ");
                debug("Got line " + line);

                // A closing sentinel tag means a complete request has been
                // buffered: hand it to the indexer and start a fresh buffer
                if (line.endsWith("</Delete>")) {
                    indexer.delete(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
                else if (line.startsWith("</Doc>")) {
                    indexer.index(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
            }

            brin.close();
            isr.close();
        }
        catch (IOException e) {
            System.err.println("Error: unable to read from stdin");
            e.printStackTrace();
        }

        indexer.finish();
    }

    static public class Indexer extends DefaultHandler {

        IndexWriter writer_   = null;
        Analyzer analyzer_    = null;
        SAXParser sax_parser_ = null;
        String doc_tag_level_ = null;

        Stack stack_ = null;
        String path_ = "";

        Document current_doc_ = null;
        String current_node_ = "";
        String current_doc_oid_ = "";
        String indexable_current_node_ = "";
        String current_contents_ = "";
        String mode_ = "";
        protected String file_id_ = null;

        static private String[] stop_words = GS2Analyzer.STOP_WORDS;

        /** pass in true to create a new index, false to use the existing one */
        public Indexer(String doc_tag_level, File index_dir, boolean create) {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                analyzer_ = new GS2Analyzer(stop_words);

                writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
                // By default Lucene will only index the first 10,000 words
                // per document, silently truncating anything longer; raising
                // the limit can instead cause out-of-memory errors on very
                // large documents
                writer_.setMaxFieldLength(Integer.MAX_VALUE);
                if (create) {
                    writer_.optimize();
                }
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }

        /** index one document stored in a file */
        public void index(String file_id, File file) {
            mode_ = "add";
            file_id_ = file_id;
            path_ = "";
            String base_path = file.getPath();
            base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));

            try {
                sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** index one document stored as a string */
        public void index(String xml_text) {
            mode_ = "add";
            file_id_ = "";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** delete one document, based on the docOID given in the XML text */
        public void delete(String xml_text) {
            mode_ = "delete";
            file_id_ = "";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** optimise the index and close the writer */
        public void finish() {
            try {
                writer_.optimize();
                writer_.close();
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }

        protected void print(String s) {
            System.out.print(s);
        }

        protected void println(String s) {
            System.out.println(s);
        }

        public void startDocument() throws SAXException {
            println("Starting to process " + file_id_);
            print("[");
        }

        public void endDocument() throws SAXException {
            println("]");
            println("... processing finished.");
        }
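        // For each doc-tag-level element, the handlers below build one
        // Lucene Document along these lines (a summary of the mapping
        // implemented in startElement()/endElement(), not extra behaviour):
        //
        //   nodeID  - stored, untokenised; from the gs2:id attribute
        //   docOID  - stored, untokenised; from gs2:docOID, and also the
        //             key used by updateDocument()/deleteDocuments()
        //   <tag>   - tokenised with term vectors; one per child element
        //             carrying index="1"
        //   by<tag> - untokenised copy of <tag>, used for sorting search
        //             results (skipped for TX and ZZ)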
        public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start a new document
                current_node_ = qName;

                String node_id = atts.getValue("gs2:id");
                print(" " + qName + ": " + node_id + " (" + mode_ + ")");
                current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
            }
            else {
                indexable_current_node_ = "";
            }
        }

        public static boolean isIndexable(Attributes atts) {
            boolean is_indexable = false;
            String index = atts.getValue("index");
            if (index != null) {
                if (index.equals("1")) {
                    is_indexable = true;
                }
            }
            return is_indexable;
        }

        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_)) {
                    current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
                    // The byXX fields are used for sorting search results.
                    // We don't want that for the Text (TX) or AllFields (ZZ)
                    // fields, and sort fields need to be untokenised
                    if (!qName.equals("TX") && !qName.equals("ZZ")) {
                        current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
                    }
                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // It would perhaps be more efficient to use
                        // addDocument() for "add" and updateDocument()
                        // only for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end of this document
                }

                path_ = removePathLink(path_);
            }
        }

        public void characters(char ch[], int start, int length) throws SAXException {
            String data = new String(ch, start, length).trim();
            if (data.length() > 0) {
                current_contents_ += data;
            }
        }

        protected String appendPathLink(String path, String qName, Attributes atts) {
            path = path + "/" + qName;
            if (atts.getLength() > 0) {
                String id = atts.getValue("gs2:id");
                if (id != null) {
                    path += "[@gs2:id='" + id + "']";
                }
                else {
                    id = atts.getValue("gs3:id");
                    if (id != null) {
                        path += "[@gs3:id='" + id + "']";
                    }
                }
            }
            return path;
        }

        protected String removePathLink(String path) {
            int i = path.lastIndexOf('/');
            if (i == -1) {
                path = "";
            }
            else {
                path = path.substring(0, i);
            }
            return path;
        }
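        // A note on the stack machinery below: doc-tag-level elements can
        // nest (a document containing indexed subsections, for instance).
        // pushOnStack() saves the partially built outer Document when an
        // inner one starts, and popOffStack() restores it once the inner
        // document has been written to the index.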
        /** these are what we save on the stack */
        private class MyDocument {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";
        }

        protected void pushOnStack() {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        protected void popOffStack() {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument) stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            }
            else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }

        protected void deleteDocument(String doc_id) throws IOException {
            debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
            debug("- Initial number of documents in index: " + writer_.docCount());
            writer_.deleteDocuments(new Term("docOID", doc_id));
            debug("- Final number of documents in index: " + writer_.docCount());
        }
    }
}
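// Example invocation (a sketch: the paths and the "Doc" tag level are
// illustrative, and in practice this class is driven by lucenebuildproc.pm,
// which pipes the document XML in on stdin):
//
//   java org.greenstone.LuceneWrapper.GS2LuceneIndexer \
//       -removeold Doc /path/to/collect/building idx < docs.xml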