Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16437

Last change on this file since 16437 was 16437, checked in by mdewsnip, 16 years ago
Now stores the new "docOID" value in the index, to help support incremental building.
Property svn:keywords set to `Author Date Id Revision`
File size: 9.4 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.analysis.standard.StandardAnalyzer;
46
47	import java.util.Stack;
48	import java.io.FileInputStream;
49	import java.io.File;
50	import java.io.StringReader;
51	import java.net.URL;
52
53
54	/**
55	* class for indexing XML generated by lucenebuildproc.pm
56	*/
57
58	public class GS2LuceneIndexer {
59
60	public static void main (String args[]) throws Exception
61	{
62
63	int verbosity = 1;
64	// Default is to edit the existing index
65	boolean create_new_index = false;
66
67	Vector filtered_args = new Vector();
68
69	int argc = args.length;
70	int i = 0;
71	while (i<argc) {
72	if (args[i].startsWith("-")) {
73
74	// -removeold causes the existing index to be overwritten
75	if (args[i].equals("-removeold")) {
76	create_new_index = true;
77	}
78
79	// -verbosity [num]
80	else if (args[i].equals("-verbosity")) {
81	i++;
82	if (i<argc) {
83	verbosity = Integer.parseInt(args[i]);
84	}
85	}
86	else {
87	System.out.println("Unrecognised option: " + args[i]);
88	}
89	}
90	else {
91	filtered_args.add((Object)args[i]);
92	}
93	i++;
94	}
95
96	if (filtered_args.size() != 3) {
97	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
98	return;
99	}
100
101	String doc_tag_level = (String)filtered_args.get(0);
102	String building_dirname = (String)filtered_args.get(1);
103	String index_dirname = (String)filtered_args.get(2);
104
105	String import_dirname = building_dirname + File.separator + "text";
106
107	File import_dir = new File(import_dirname);
108	File building_dir = new File(building_dirname);
109
110	if (!import_dir.exists()) {
111	System.out.println("Couldn't find import directory: "+import_dirname);
112	return;
113	}
114
115	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116	idx_dir.mkdir();
117
118	// Set up indexer
119	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121	// Read from stdin the files to process
122	try {
123	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124	BufferedReader brin = new BufferedReader(isr);
125
126	StringBuffer xml_text = new StringBuffer(1024);
127	String line = null;
128	while ((line = brin.readLine()) != null) {
129	xml_text.append(line);
130	if (line.startsWith("</Doc>")) {
131	indexer.index(xml_text.toString());
132	xml_text = new StringBuffer(1024);
133	}
134	}
135
136	brin.close();
137	isr.close();
138
139	} catch (IOException e) {
140	System.err.println("Error: unable to read from stdin");
141	e.printStackTrace();
142	}
143
144	indexer.finish();
145	}
146
147
148	static public class Indexer extends DefaultHandler
149	{
150	IndexWriter writer_ = null; // Lucene index writer; opened in the constructor, optimised and closed by finish()
151	SAXParser sax_parser_ = null; // SAX parser used by both index() overloads
152	String doc_tag_level_ = null; // element name whose start/end delimits one Lucene document
153
154	Stack stack_ = null; // MyDocument snapshots saved by pushOnStack() and restored by popOffStack()
155	String path_ = ""; // path of the element being parsed; maintained by appendPathLink()/removePathLink()
156
157	Document current_doc_ = null; // Lucene document currently being filled
158	String current_node_ = ""; // tag name recorded in startElement() when current_doc_ was started
159	String indexable_current_node_ = ""; // name of the most recently opened element if XMLTagInfo.isIndexable(atts), otherwise ""
160	String current_contents_ = ""; // trimmed character data accumulated by characters()
161
162	protected String file_id_ = null; // label printed in the "Starting to index" progress message
163
164	/** pass in true if want to create a new index, false if want to use the existing one */
165	public Indexer (String doc_tag_level, File index_dir, boolean create)
166	{
167	doc_tag_level_ = doc_tag_level;
168
169	try {
170	stack_ = new Stack();
171	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
172	sax_parser_ = sax_factory.newSAXParser();
173
174	XMLReader reader = sax_parser_.getXMLReader();
175	reader.setFeature("http://xml.org/sax/features/validation", false);
176
177	writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
178	// by default, will only index 10,000 words per document
179	// Can throw out_of_memory errors
180	writer_.setMaxFieldLength(Integer.MAX_VALUE);
181	if (create) {
182	writer_.optimize();
183	}
184
185	} catch (Exception e) {
186	// do nothing!
187	}
188	}
189
190	/** index one document */
191	public void index (String file_id, File file)
192	{
193	file_id_ = file_id;
194	path_ = "";
195	String base_path = file.getPath();
196	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
197
198	try {
199	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
200	}
201	catch (Exception e) {
202	println("parse error:");
203	e.printStackTrace();
204	}
205	}
206
207	/** index one document stored as string*/
208	public void index (String xml_text)
209	{
210	file_id_ = "<xml doc on stdin>";
211	path_ = "";
212
213	try {
214	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
215	}
216	catch (Exception e) {
217	println("parse error:");
218	e.printStackTrace();
219	}
220	}
221
222	public void finish()
223	{
224	/** optimise the index */
225	try {
226	writer_.optimize();
227	writer_.close();
228	}
229	catch (Exception e) {
230	}
231	}
232
233	protected void print(String s)
234	{
235	System.out.print(s);
236	}
237
238	protected void println(String s)
239	{
240	System.out.println(s);
241	}
242
243	public void startDocument() throws SAXException
244	{
245	println("Starting to index " + file_id_);
246	print("[");
247	}
248
249	public void endDocument() throws SAXException
250	{
251	println("]");
252	println("... indexing finished.");
253	}
254
255	public void startElement(String uri, String localName, String qName, Attributes atts)
256	throws SAXException
257	{
258	path_ = appendPathLink(path_, qName, atts);
259
260	if (qName.equals(doc_tag_level_)) {
261	pushOnStack(); // start new doc
262	current_node_ = qName;
263
264	String node_id = atts.getValue("gs2:id");
265	print(" " + qName + ": " + node_id );
266	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
267
268	String current_doc_oid_ = atts.getValue("gs2:docOID");
269	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
270	}
271
272	if (XMLTagInfo.isIndexable(atts)) {
273	indexable_current_node_ = qName;
274	}
275	else {
276	indexable_current_node_ = "";
277	}
278
279	}
280
281	public void endElement(String uri, String localName, String qName) throws SAXException
282	{
283	if (qName.equals(indexable_current_node_))
284	{
285	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
286	// We only need the term vector for the TX field
287	if (!qName.equals("TX"))
288	{
289	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
290	}
291
292	current_contents_ = "";
293	}
294
295	if (qName.equals(doc_tag_level_)) {
296	try {
297	writer_.addDocument(current_doc_);
298	}
299	catch (java.io.IOException e) {
300	e.printStackTrace();
301	}
302	popOffStack(); // end document
303	}
304
305	path_ = removePathLink(path_);
306	}
307
308	public void characters(char ch[], int start, int length) throws SAXException
309	{
310	String data = new String(ch, start, length).trim();
311	if (data.length() > 0 ) {
312	current_contents_ += data;
313	}
314	}
315
316	protected String appendPathLink(String path, String qName, Attributes atts)
317	{
318
319	path = path + "/"+qName;
320	if (atts.getLength()>0) {
321	String id = atts.getValue("gs2:id");
322	if (id != null) {
323	path += "[@gs2:id='"+id+"']";
324	}
325	else {
326	id = atts.getValue("gs3:id");
327	if (id != null) {
328	path += "[@gs3:id='"+id+"']";
329	}
330	}
331	}
332	return path;
333	}
334
335	protected String removePathLink(String path)
336	{
337
338	int i=path.lastIndexOf('/');
339	if (i==-1) {
340	path="";
341	} else {
342	path = path.substring(0, i);
343	}
344	return path;
345	}
346
347
348	/** these are what we save on the stack */
349	private class MyDocument
350	{
351	public Document doc = null;
352	public String contents = null;
353	public String tagname = "";
354
355	}
356
357
358	protected void pushOnStack()
359	{
360	if (current_doc_ != null) {
361	MyDocument save = new MyDocument();
362	save.doc = current_doc_;
363	save.contents = current_contents_;
364	save.tagname = current_node_;
365	stack_.push(save);
366	}
367	current_doc_ = new Document();
368	current_contents_ = "";
369	current_node_ = "";
370	}
371
372	protected void popOffStack()
373	{
374	if (!stack_.empty()) {
375	MyDocument saved = (MyDocument)stack_.pop();
376	current_doc_ = saved.doc;
377	current_contents_ = saved.contents;
378	current_node_ = saved.tagname;
379	} else {
380	current_doc_ = new Document();
381	current_contents_ = "";
382	current_node_ = "";
383	}
384	}
385	}
386	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: