Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 17804

Last change on this file since 17804 was 17804, checked in by davidb, 15 years ago
Introduction of GS2Analyzer, which overrides default behaviour of StandardAnalyzer to make accent folding of Latin-1 on
Property svn:keywords set to `Author Date Id Revision`
File size: 9.9 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	public static void main (String args[]) throws Exception
62	{
63	int verbosity = 1;
64	// Default is to edit the existing index
65	boolean create_new_index = false;
66
67	Vector filtered_args = new Vector();
68
69	int argc = args.length;
70	int i = 0;
71	while (i<argc) {
72	if (args[i].startsWith("-")) {
73
74	// -removeold causes the existing index to be overwritten
75	if (args[i].equals("-removeold")) {
76	create_new_index = true;
77	}
78
79	// -verbosity [num]
80	else if (args[i].equals("-verbosity")) {
81	i++;
82	if (i<argc) {
83	verbosity = Integer.parseInt(args[i]);
84	}
85	}
86	else {
87	System.out.println("Unrecognised option: " + args[i]);
88	}
89	}
90	else {
91	filtered_args.add((Object)args[i]);
92	}
93	i++;
94	}
95
96	if (filtered_args.size() != 3) {
97	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
98	return;
99	}
100
101	String doc_tag_level = (String)filtered_args.get(0);
102	String building_dirname = (String)filtered_args.get(1);
103	String index_dirname = (String)filtered_args.get(2);
104
105	String import_dirname = building_dirname + File.separator + "text";
106
107	File import_dir = new File(import_dirname);
108	File building_dir = new File(building_dirname);
109
110	if (!import_dir.exists()) {
111	System.out.println("Couldn't find import directory: "+import_dirname);
112	return;
113	}
114
115	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116	idx_dir.mkdir();
117
118	// Set up indexer
119	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121	// Read from stdin the files to process
122	try {
123	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124	BufferedReader brin = new BufferedReader(isr);
125
126	StringBuffer xml_text = new StringBuffer(1024);
127	String line = null;
128	while ((line = brin.readLine()) != null) {
129	xml_text.append(line);
130	if (line.startsWith("</Doc>")) {
131	indexer.index(xml_text.toString());
132	xml_text = new StringBuffer(1024);
133	}
134	}
135
136	brin.close();
137	isr.close();
138
139	} catch (IOException e) {
140	System.err.println("Error: unable to read from stdin");
141	e.printStackTrace();
142	}
143
144	indexer.finish();
145	}
146
147
148	static public class Indexer extends DefaultHandler
149	{
150	IndexWriter writer_ = null;
151	Analyzer analyzer_ = null;
152	SAXParser sax_parser_ = null;
153	String doc_tag_level_ = null;
154
155	Stack stack_ = null;
156	String path_ = "";
157
158	Document current_doc_ = null;
159	String current_node_ = "";
160	String current_doc_oid_ = "";
161	String indexable_current_node_ = "";
162	String current_contents_ = "";
163
164	protected String file_id_ = null;
165
166	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
167
168	/** pass in true if want to create a new index, false if want to use the existing one */
169	public Indexer (String doc_tag_level, File index_dir, boolean create)
170	{
171	doc_tag_level_ = doc_tag_level;
172
173	try {
174	stack_ = new Stack();
175	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
176	sax_parser_ = sax_factory.newSAXParser();
177
178	XMLReader reader = sax_parser_.getXMLReader();
179	reader.setFeature("http://xml.org/sax/features/validation", false);
180
181	analyzer_ = new GS2Analyzer(stop_words);
182
183	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
184	// by default, will only index 10,000 words per document
185	// Can throw out_of_memory errors
186	writer_.setMaxFieldLength(Integer.MAX_VALUE);
187	if (create) {
188	writer_.optimize();
189	}
190	}
191	catch (Exception e) {
192	// We need to know if creating/opening the index fails
193	e.printStackTrace();
194	}
195	}
196
197	/** index one document */
198	public void index (String file_id, File file)
199	{
200	file_id_ = file_id;
201	path_ = "";
202	String base_path = file.getPath();
203	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
204
205	try {
206	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
207	}
208	catch (Exception e) {
209	println("parse error:");
210	e.printStackTrace();
211	}
212	}
213
214	/** index one document stored as string*/
215	public void index (String xml_text)
216	{
217	file_id_ = "<xml doc on stdin>";
218	path_ = "";
219
220	try {
221	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
222	}
223	catch (Exception e) {
224	println("parse error:");
225	e.printStackTrace();
226	}
227	}
228
229	public void finish()
230	{
231	/** optimise the index */
232	try {
233	writer_.optimize();
234	writer_.close();
235	}
236	catch (Exception e) {
237	}
238	}
239
240	protected void print(String s)
241	{
242	System.out.print(s);
243	}
244
245	protected void println(String s)
246	{
247	System.out.println(s);
248	}
249
250	public void startDocument() throws SAXException
251	{
252	println("Starting to index " + file_id_);
253	print("[");
254	}
255
256	public void endDocument() throws SAXException
257	{
258	println("]");
259	println("... indexing finished.");
260	}
261
262	public void startElement(String uri, String localName, String qName, Attributes atts)
263	throws SAXException
264	{
265	path_ = appendPathLink(path_, qName, atts);
266
267	if (qName.equals(doc_tag_level_)) {
268	pushOnStack(); // start new doc
269	current_node_ = qName;
270
271	String node_id = atts.getValue("gs2:id");
272	print(" " + qName + ": " + node_id );
273	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.TOKENIZED));
274
275	current_doc_oid_ = atts.getValue("gs2:docOID");
276	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.TOKENIZED));
277	}
278
279	if (isIndexable(atts)) {
280	indexable_current_node_ = qName;
281	}
282	else {
283	indexable_current_node_ = "";
284	}
285	}
286
287	public static boolean isIndexable(Attributes atts)
288	{
289	boolean is_indexable = false;
290
291	String index = atts.getValue("index");
292	if (index!=null) {
293	if (index.equals("1")) {
294	is_indexable = true;
295	}
296	}
297	return is_indexable;
298	}
299
300	public void endElement(String uri, String localName, String qName) throws SAXException
301	{
302	if (qName.equals(indexable_current_node_))
303	{
304	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
305	// We only need the term vector for the TX field
306	if (!qName.equals("TX"))
307	{
308	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
309	}
310
311	current_contents_ = "";
312	}
313
314	if (qName.equals(doc_tag_level_)) {
315	try {
316	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
317	}
318	catch (java.io.IOException e) {
319	e.printStackTrace();
320	}
321	popOffStack(); // end document
322	}
323
324	path_ = removePathLink(path_);
325	}
326
327	public void characters(char ch[], int start, int length) throws SAXException
328	{
329	String data = new String(ch, start, length).trim();
330	if (data.length() > 0 ) {
331	current_contents_ += data;
332	}
333	}
334
335	protected String appendPathLink(String path, String qName, Attributes atts)
336	{
337
338	path = path + "/"+qName;
339	if (atts.getLength()>0) {
340	String id = atts.getValue("gs2:id");
341	if (id != null) {
342	path += "[@gs2:id='"+id+"']";
343	}
344	else {
345	id = atts.getValue("gs3:id");
346	if (id != null) {
347	path += "[@gs3:id='"+id+"']";
348	}
349	}
350	}
351	return path;
352	}
353
354	protected String removePathLink(String path)
355	{
356
357	int i=path.lastIndexOf('/');
358	if (i==-1) {
359	path="";
360	} else {
361	path = path.substring(0, i);
362	}
363	return path;
364	}
365
366
367	/** these are what we save on the stack */
368	private class MyDocument
369	{
370	public Document doc = null;
371	public String contents = null;
372	public String tagname = "";
373
374	}
375
376
377	protected void pushOnStack()
378	{
379	if (current_doc_ != null) {
380	MyDocument save = new MyDocument();
381	save.doc = current_doc_;
382	save.contents = current_contents_;
383	save.tagname = current_node_;
384	stack_.push(save);
385	}
386	current_doc_ = new Document();
387	current_contents_ = "";
388	current_node_ = "";
389	}
390
391	protected void popOffStack()
392	{
393	if (!stack_.empty()) {
394	MyDocument saved = (MyDocument)stack_.pop();
395	current_doc_ = saved.doc;
396	current_contents_ = saved.contents;
397	current_node_ = saved.tagname;
398	} else {
399	current_doc_ = new Document();
400	current_contents_ = "";
401	current_node_ = "";
402	}
403	}
404	}
405	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: