Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16438

Last change on this file since 16438 was 16438, checked in by mdewsnip, 16 years ago
Now uses Lucene 2.3.2's updateDocument() function instead of addDocument(), in preparation for implementing incremental building.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.5 KB

Rev	Line
[13555]	1	/**********************************************************************
[8521]	2	*
[13555]	3	* GS2LuceneIndexer.java
	4	*
	5	* Copyright 2004 The New Zealand Digital Library Project
	6	*
	7	* A component of the Greenstone digital library software
	8	* from the New Zealand Digital Library Project at the
	9	* University of Waikato, New Zealand.
	10	*
	11	* This program is free software; you can redistribute it and/or modify
	12	* it under the terms of the GNU General Public License as published by
	13	* the Free Software Foundation; either version 2 of the License, or
	14	* (at your option) any later version.
	15	*
	16	* This program is distributed in the hope that it will be useful,
	17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	* GNU General Public License for more details.
	20	*
	21	* You should have received a copy of the GNU General Public License
	22	* along with this program; if not, write to the Free Software
	23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	*
	25	*********************************************************************/
[8521]	26
[13686]	27	package org.greenstone.LuceneWrapper;
[8521]	28
[12257]	29
[8521]	30	import java.io.*;
[10164]	31	import java.util.Vector;
[8521]	32
[16432]	33	import org.xml.sax.Attributes;
	34	import org.xml.sax.helpers.DefaultHandler;
	35	import org.xml.sax.InputSource;
	36	import org.xml.sax.SAXException;
	37	import org.xml.sax.XMLReader;
	38
	39	import javax.xml.parsers.SAXParser;
	40	import javax.xml.parsers.SAXParserFactory;
	41
	42	import org.apache.lucene.document.Document;
	43	import org.apache.lucene.document.Field;
	44	import org.apache.lucene.index.IndexWriter;
[16438]	45	import org.apache.lucene.index.Term;
[16432]	46	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	47
	48	import java.util.Stack;
	49	import java.io.FileInputStream;
	50	import java.io.File;
	51	import java.io.StringReader;
	52	import java.net.URL;
	53
	54
[13555]	55	/**
	56	* class for indexing XML generated by lucenebuildproc.pm
	57	*/
	58
[8521]	59	public class GS2LuceneIndexer {
	60
[10164]	61	public static void main (String args[]) throws Exception
	62	{
	63
	64	int verbosity = 1;
[16262]	65	// Default is to edit the existing index
	66	boolean create_new_index = false;
[10164]	67
	68	Vector filtered_args = new Vector();
	69
	70	int argc = args.length;
	71	int i = 0;
	72	while (i<argc) {
	73	if (args[i].startsWith("-")) {
	74
[16262]	75	// -removeold causes the existing index to be overwritten
	76	if (args[i].equals("-removeold")) {
	77	create_new_index = true;
[10164]	78	}
	79
	80	// -verbosity [num]
	81	else if (args[i].equals("-verbosity")) {
	82	i++;
	83	if (i<argc) {
	84	verbosity = Integer.parseInt(args[i]);
	85	}
	86	}
	87	else {
	88	System.out.println("Unrecognised option: " + args[i]);
	89	}
	90	}
	91	else {
	92	filtered_args.add((Object)args[i]);
	93	}
	94	i++;
	95	}
	96
	97	if (filtered_args.size() != 3) {
[16262]	98	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
[8521]	99	return;
	100	}
	101
[10164]	102	String doc_tag_level = (String)filtered_args.get(0);
	103	String building_dirname = (String)filtered_args.get(1);
	104	String index_dirname = (String)filtered_args.get(2);
[8521]	105
	106	String import_dirname = building_dirname + File.separator + "text";
	107
	108	File import_dir = new File(import_dirname);
	109	File building_dir = new File(building_dirname);
	110
	111	if (!import_dir.exists()) {
	112	System.out.println("Couldn't find import directory: "+import_dirname);
	113	return;
	114	}
	115
	116	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
	117	idx_dir.mkdir();
	118
	119	// Set up indexer
[16430]	120	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
[8521]	121
	122	// Read from stdin the files to process
	123	try {
[9988]	124	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
[8521]	125	BufferedReader brin = new BufferedReader(isr);
	126
	127	StringBuffer xml_text = new StringBuffer(1024);
	128	String line = null;
	129	while ((line = brin.readLine()) != null) {
	130	xml_text.append(line);
	131	if (line.startsWith("</Doc>")) {
[16430]	132	indexer.index(xml_text.toString());
[8521]	133	xml_text = new StringBuffer(1024);
	134	}
	135	}
	136
	137	brin.close();
	138	isr.close();
	139
	140	} catch (IOException e) {
	141	System.err.println("Error: unable to read from stdin");
	142	e.printStackTrace();
	143	}
	144
	145	indexer.finish();
	146	}
[16432]	147
	148
	149	static public class Indexer extends DefaultHandler
	150	{
	151	IndexWriter writer_ = null;
	152	SAXParser sax_parser_ = null;
	153	String doc_tag_level_ = null;
	154
	155	Stack stack_ = null;
	156	String path_ = "";
	157
	158	Document current_doc_ = null;
	159	String current_node_ = "";
[16438]	160	String current_doc_oid_ = "";
[16432]	161	String indexable_current_node_ = "";
	162	String current_contents_ = "";
	163
	164	protected String file_id_ = null;
	165
	166	/** pass in true if want to create a new index, false if want to use the existing one */
	167	public Indexer (String doc_tag_level, File index_dir, boolean create)
	168	{
	169	doc_tag_level_ = doc_tag_level;
	170
	171	try {
	172	stack_ = new Stack();
	173	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
	174	sax_parser_ = sax_factory.newSAXParser();
	175
	176	XMLReader reader = sax_parser_.getXMLReader();
	177	reader.setFeature("http://xml.org/sax/features/validation", false);
	178
	179	writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
	180	// by default, will only index 10,000 words per document
	181	// Can throw out_of_memory errors
	182	writer_.setMaxFieldLength(Integer.MAX_VALUE);
	183	if (create) {
	184	writer_.optimize();
	185	}
	186
	187	} catch (Exception e) {
	188	// do nothing!
	189	}
	190	}
	191
	192	/** index one document */
	193	public void index (String file_id, File file)
	194	{
	195	file_id_ = file_id;
	196	path_ = "";
	197	String base_path = file.getPath();
	198	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
	199
	200	try {
	201	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
	202	}
	203	catch (Exception e) {
	204	println("parse error:");
	205	e.printStackTrace();
	206	}
	207	}
	208
	209	/** index one document stored as string*/
	210	public void index (String xml_text)
	211	{
	212	file_id_ = "<xml doc on stdin>";
	213	path_ = "";
	214
	215	try {
	216	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
	217	}
	218	catch (Exception e) {
	219	println("parse error:");
	220	e.printStackTrace();
	221	}
	222	}
	223
	224	public void finish()
	225	{
	226	/** optimise the index */
	227	try {
	228	writer_.optimize();
	229	writer_.close();
	230	}
	231	catch (Exception e) {
	232	}
	233	}
	234
	235	protected void print(String s)
	236	{
	237	System.out.print(s);
	238	}
	239
	240	protected void println(String s)
	241	{
	242	System.out.println(s);
	243	}
	244
	245	public void startDocument() throws SAXException
	246	{
	247	println("Starting to index " + file_id_);
	248	print("[");
	249	}
	250
	251	public void endDocument() throws SAXException
	252	{
	253	println("]");
	254	println("... indexing finished.");
	255	}
	256
	257	public void startElement(String uri, String localName, String qName, Attributes atts)
	258	throws SAXException
	259	{
	260	path_ = appendPathLink(path_, qName, atts);
	261
	262	if (qName.equals(doc_tag_level_)) {
	263	pushOnStack(); // start new doc
	264	current_node_ = qName;
[16437]	265
[16432]	266	String node_id = atts.getValue("gs2:id");
	267	print(" " + qName + ": " + node_id );
	268	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
[16437]	269
[16438]	270	current_doc_oid_ = atts.getValue("gs2:docOID");
[16437]	271	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
[16432]	272	}
	273
	274	if (XMLTagInfo.isIndexable(atts)) {
	275	indexable_current_node_ = qName;
	276	}
	277	else {
	278	indexable_current_node_ = "";
	279	}
	280
	281	}
	282
	283	public void endElement(String uri, String localName, String qName) throws SAXException
	284	{
	285	if (qName.equals(indexable_current_node_))
	286	{
	287	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
	288	// We only need the term vector for the TX field
	289	if (!qName.equals("TX"))
	290	{
	291	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
	292	}
	293
	294	current_contents_ = "";
	295	}
	296
	297	if (qName.equals(doc_tag_level_)) {
	298	try {
[16438]	299	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
[16432]	300	}
	301	catch (java.io.IOException e) {
	302	e.printStackTrace();
	303	}
	304	popOffStack(); // end document
	305	}
	306
	307	path_ = removePathLink(path_);
	308	}
	309
	310	public void characters(char ch[], int start, int length) throws SAXException
	311	{
	312	String data = new String(ch, start, length).trim();
	313	if (data.length() > 0 ) {
	314	current_contents_ += data;
	315	}
	316	}
	317
	318	protected String appendPathLink(String path, String qName, Attributes atts)
	319	{
	320
	321	path = path + "/"+qName;
	322	if (atts.getLength()>0) {
	323	String id = atts.getValue("gs2:id");
	324	if (id != null) {
	325	path += "[@gs2:id='"+id+"']";
	326	}
	327	else {
	328	id = atts.getValue("gs3:id");
	329	if (id != null) {
	330	path += "[@gs3:id='"+id+"']";
	331	}
	332	}
	333	}
	334	return path;
	335	}
	336
	337	protected String removePathLink(String path)
	338	{
	339
	340	int i=path.lastIndexOf('/');
	341	if (i==-1) {
	342	path="";
	343	} else {
	344	path = path.substring(0, i);
	345	}
	346	return path;
	347	}
	348
	349
	350	/** these are what we save on the stack */
	351	private class MyDocument
	352	{
	353	public Document doc = null;
	354	public String contents = null;
	355	public String tagname = "";
	356
	357	}
	358
	359
	360	protected void pushOnStack()
	361	{
	362	if (current_doc_ != null) {
	363	MyDocument save = new MyDocument();
	364	save.doc = current_doc_;
	365	save.contents = current_contents_;
	366	save.tagname = current_node_;
	367	stack_.push(save);
	368	}
	369	current_doc_ = new Document();
	370	current_contents_ = "";
	371	current_node_ = "";
	372	}
	373
	374	protected void popOffStack()
	375	{
	376	if (!stack_.empty()) {
	377	MyDocument saved = (MyDocument)stack_.pop();
	378	current_doc_ = saved.doc;
	379	current_contents_ = saved.contents;
	380	current_node_ = saved.tagname;
	381	} else {
	382	current_doc_ = new Document();
	383	current_contents_ = "";
	384	current_node_ = "";
	385	}
	386	}
	387	}
[8521]	388	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: