Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/trunk/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago
Undoing change committed in r16582
Property svn:keywords set to `Author Date Id Revision`
File size: 9.8 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.standard.StandardAnalyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	public static void main (String args[]) throws Exception
62	{
63
64	int verbosity = 1;
65	// Default is to edit the existing index
66	boolean create_new_index = false;
67
68	Vector filtered_args = new Vector();
69
70	int argc = args.length;
71	int i = 0;
72	while (i<argc) {
73	if (args[i].startsWith("-")) {
74
75	// -removeold causes the existing index to be overwritten
76	if (args[i].equals("-removeold")) {
77	create_new_index = true;
78	}
79
80	// -verbosity [num]
81	else if (args[i].equals("-verbosity")) {
82	i++;
83	if (i<argc) {
84	verbosity = Integer.parseInt(args[i]);
85	}
86	}
87	else {
88	System.out.println("Unrecognised option: " + args[i]);
89	}
90	}
91	else {
92	filtered_args.add((Object)args[i]);
93	}
94	i++;
95	}
96
97	if (filtered_args.size() != 3) {
98	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
99	return;
100	}
101
102	String doc_tag_level = (String)filtered_args.get(0);
103	String building_dirname = (String)filtered_args.get(1);
104	String index_dirname = (String)filtered_args.get(2);
105
106	String import_dirname = building_dirname + File.separator + "text";
107
108	File import_dir = new File(import_dirname);
109	File building_dir = new File(building_dirname);
110
111	if (!import_dir.exists()) {
112	System.out.println("Couldn't find import directory: "+import_dirname);
113	return;
114	}
115
116	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
117	idx_dir.mkdir();
118
119	// Set up indexer
120	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
121
122	// Read from stdin the files to process
123	try {
124	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
125	BufferedReader brin = new BufferedReader(isr);
126
127	StringBuffer xml_text = new StringBuffer(1024);
128	String line = null;
129	while ((line = brin.readLine()) != null) {
130	xml_text.append(line);
131	if (line.startsWith("</Doc>")) {
132	indexer.index(xml_text.toString());
133	xml_text = new StringBuffer(1024);
134	}
135	}
136
137	brin.close();
138	isr.close();
139
140	} catch (IOException e) {
141	System.err.println("Error: unable to read from stdin");
142	e.printStackTrace();
143	}
144
145	indexer.finish();
146	}
147
148
149	static public class Indexer extends DefaultHandler
150	{
151	IndexWriter writer_ = null;
152	SAXParser sax_parser_ = null;
153	String doc_tag_level_ = null;
154
155	Stack stack_ = null;
156	String path_ = "";
157
158	Document current_doc_ = null;
159	String current_node_ = "";
160	String current_doc_oid_ = "";
161	String indexable_current_node_ = "";
162	String current_contents_ = "";
163
164	protected String file_id_ = null;
165
166	/** pass in true if want to create a new index, false if want to use the existing one */
167	public Indexer (String doc_tag_level, File index_dir, boolean create)
168	{
169	doc_tag_level_ = doc_tag_level;
170
171	try {
172	stack_ = new Stack();
173	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
174	sax_parser_ = sax_factory.newSAXParser();
175
176	XMLReader reader = sax_parser_.getXMLReader();
177	reader.setFeature("http://xml.org/sax/features/validation", false);
178
179	writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
180	// by default, will only index 10,000 words per document
181	// Can throw out_of_memory errors
182	writer_.setMaxFieldLength(Integer.MAX_VALUE);
183	if (create) {
184	writer_.optimize();
185	}
186	}
187	catch (Exception e) {
188	// We need to know if creating/opening the index fails
189	e.printStackTrace();
190	}
191	}
192
193	/** index one document */
194	public void index (String file_id, File file)
195	{
196	file_id_ = file_id;
197	path_ = "";
198	String base_path = file.getPath();
199	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
200
201	try {
202	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
203	}
204	catch (Exception e) {
205	println("parse error:");
206	e.printStackTrace();
207	}
208	}
209
210	/** index one document stored as string*/
211	public void index (String xml_text)
212	{
213	file_id_ = "<xml doc on stdin>";
214	path_ = "";
215
216	try {
217	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
218	}
219	catch (Exception e) {
220	println("parse error:");
221	e.printStackTrace();
222	}
223	}
224
225	public void finish()
226	{
227	/** optimise the index */
228	try {
229	writer_.optimize();
230	writer_.close();
231	}
232	catch (Exception e) {
233	}
234	}
235
236	protected void print(String s)
237	{
238	System.out.print(s);
239	}
240
241	protected void println(String s)
242	{
243	System.out.println(s);
244	}
245
246	public void startDocument() throws SAXException
247	{
248	println("Starting to index " + file_id_);
249	print("[");
250	}
251
252	public void endDocument() throws SAXException
253	{
254	println("]");
255	println("... indexing finished.");
256	}
257
258	public void startElement(String uri, String localName, String qName, Attributes atts)
259	throws SAXException
260	{
261	path_ = appendPathLink(path_, qName, atts);
262
263	if (qName.equals(doc_tag_level_)) {
264	pushOnStack(); // start new doc
265	current_node_ = qName;
266
267	String node_id = atts.getValue("gs2:id");
268	print(" " + qName + ": " + node_id );
269	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
270
271	current_doc_oid_ = atts.getValue("gs2:docOID");
272	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
273	}
274
275	if (isIndexable(atts)) {
276	indexable_current_node_ = qName;
277	}
278	else {
279	indexable_current_node_ = "";
280	}
281	}
282
283	public static boolean isIndexable(Attributes atts)
284	{
285	boolean is_indexable = false;
286
287	String index = atts.getValue("index");
288	if (index!=null) {
289	if (index.equals("1")) {
290	is_indexable = true;
291	}
292	}
293	return is_indexable;
294	}
295
296	public void endElement(String uri, String localName, String qName) throws SAXException
297	{
298	if (qName.equals(indexable_current_node_))
299	{
300	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
301	// We only need the term vector for the TX field
302	if (!qName.equals("TX"))
303	{
304	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
305	}
306
307	current_contents_ = "";
308	}
309
310	if (qName.equals(doc_tag_level_)) {
311	try {
312	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
313	}
314	catch (java.io.IOException e) {
315	e.printStackTrace();
316	}
317	popOffStack(); // end document
318	}
319
320	path_ = removePathLink(path_);
321	}
322
323	public void characters(char ch[], int start, int length) throws SAXException
324	{
325	String data = new String(ch, start, length).trim();
326	if (data.length() > 0 ) {
327	current_contents_ += data;
328	}
329	}
330
331	protected String appendPathLink(String path, String qName, Attributes atts)
332	{
333
334	path = path + "/"+qName;
335	if (atts.getLength()>0) {
336	String id = atts.getValue("gs2:id");
337	if (id != null) {
338	path += "[@gs2:id='"+id+"']";
339	}
340	else {
341	id = atts.getValue("gs3:id");
342	if (id != null) {
343	path += "[@gs3:id='"+id+"']";
344	}
345	}
346	}
347	return path;
348	}
349
350	protected String removePathLink(String path)
351	{
352
353	int i=path.lastIndexOf('/');
354	if (i==-1) {
355	path="";
356	} else {
357	path = path.substring(0, i);
358	}
359	return path;
360	}
361
362
363	/** these are what we save on the stack */
364	private class MyDocument
365	{
366	public Document doc = null;
367	public String contents = null;
368	public String tagname = "";
369
370	}
371
372
373	protected void pushOnStack()
374	{
375	if (current_doc_ != null) {
376	MyDocument save = new MyDocument();
377	save.doc = current_doc_;
378	save.contents = current_contents_;
379	save.tagname = current_node_;
380	stack_.push(save);
381	}
382	current_doc_ = new Document();
383	current_contents_ = "";
384	current_node_ = "";
385	}
386
387	protected void popOffStack()
388	{
389	if (!stack_.empty()) {
390	MyDocument saved = (MyDocument)stack_.pop();
391	current_doc_ = saved.doc;
392	current_contents_ = saved.contents;
393	current_node_ = saved.tagname;
394	} else {
395	current_doc_ = new Document();
396	current_contents_ = "";
397	current_node_ = "";
398	}
399	}
400	}
401	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: