Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 13686

Last change on this file since 13686 was 13686, checked in by kjdon, 17 years ago
package has changed to org.greenstone.LuceneWrapper to be consistent with other indexer packages
Property svn:keywords set to `Author Date Id Revision`
File size: 7.2 KB

Line
1	/**********************************************************************
2	*
3	* Indexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.LuceneWrapper;
27
28
29	import org.xml.sax.Attributes;
30	import org.xml.sax.helpers.DefaultHandler;
31	import org.xml.sax.InputSource;
32	import org.xml.sax.SAXException;
33	import org.xml.sax.XMLReader;
34
35	import javax.xml.parsers.SAXParser;
36	import javax.xml.parsers.SAXParserFactory;
37
38	import org.apache.lucene.document.Document;
39	import org.apache.lucene.document.Field;
40	import org.apache.lucene.index.IndexWriter;
41	import org.apache.lucene.analysis.standard.StandardAnalyzer;
42
43	import java.util.Stack;
44	import java.io.FileInputStream;
45	import java.io.File;
46	import java.io.StringReader;
47	import java.net.URL;
48
49
50
51	public class Indexer extends DefaultHandler
52	{
53	IndexWriter writer_ = null;
54	SAXParser sax_parser_ = null;
55	String doc_tag_level_ = null;
56
57	Stack stack_ = null;
58	String path_ = "";
59
60	Document current_doc_ = null;
61	String current_node_ = "";
62	String indexable_current_node_ = "";
63	String current_contents_ = "";
64
65	protected String file_id_ = null;
66
67	/** pass in true if want to create a new index, false if want to use the existing one */
68	public Indexer (String doc_tag_level, File index_dir, boolean create)
69	{
70	doc_tag_level_ = doc_tag_level;
71
72	try {
73	stack_ = new Stack();
74	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
75	sax_parser_ = sax_factory.newSAXParser();
76
77	XMLReader reader = sax_parser_.getXMLReader();
78	reader.setFeature("http://xml.org/sax/features/validation", false);
79
80	writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
81	// by default, will only index 10,000 words per document
82	// Can throw out_of_memory errors
83	writer_.setMaxFieldLength(Integer.MAX_VALUE);
84	if (create) {
85	writer_.optimize();
86	}
87
88	} catch (Exception e) {
89	// do nothing!
90	}
91	}
92
93	/** index one document */
94	public void index (String file_id, File file)
95	{
96	file_id_ = file_id;
97	path_ = "";
98	String base_path = file.getPath();
99	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
100
101	try {
102	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
103	}
104	catch (Exception e) {
105	println("parse error:");
106	e.printStackTrace();
107	}
108	}
109
110	/** index one document stored as string*/
111	public void index (String xml_text)
112	{
113	file_id_ = "<xml doc on stdin>";
114	path_ = "";
115
116	try {
117	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
118	}
119	catch (Exception e) {
120	println("parse error:");
121	e.printStackTrace();
122	}
123	}
124
125	public void finish()
126	{
127	/** optimise the index */
128	try {
129	writer_.optimize();
130	writer_.close();
131	}
132	catch (Exception e) {
133	}
134	}
135
136	protected void print(String s)
137	{
138	System.out.print(s);
139	}
140
141	protected void println(String s)
142	{
143	System.out.println(s);
144	}
145
146	public void startDocument() throws SAXException
147	{
148	println("Starting to index " + file_id_);
149	print("[");
150	}
151
152	public void endDocument() throws SAXException
153	{
154	println("]");
155	println("... indexing finished.");
156	}
157
158	public void startElement(String uri, String localName, String qName, Attributes atts)
159	throws SAXException
160	{
161	path_ = appendPathLink(path_, qName, atts);
162
163	if (qName.equals(doc_tag_level_)) {
164	pushOnStack(); // start new doc
165	current_node_ = qName;
166	String node_id = atts.getValue("gs2:id");
167
168	print(" " + qName + ": " + node_id );
169	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NO));
170	}
171
172	if (XMLTagInfo.isIndexable(atts)) {
173	indexable_current_node_ = qName;
174	}
175	else {
176	indexable_current_node_ = "";
177	}
178
179	}
180	public void endElement(String uri, String localName, String qName) throws SAXException
181	{
182	if (qName.equals(indexable_current_node_))
183	{
184	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
185	// We only need the term vector for the TX field
186	if (!qName.equals("TX"))
187	{
188	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
189	}
190
191	current_contents_ = "";
192	}
193
194	if (qName.equals(doc_tag_level_)) {
195	try {
196	writer_.addDocument(current_doc_);
197	}
198	catch (java.io.IOException e) {
199	e.printStackTrace();
200	}
201	popOffStack(); // end document
202	}
203
204	path_ = removePathLink(path_);
205	}
206
207	public void characters(char ch[], int start, int length) throws SAXException
208	{
209	String data = new String(ch, start, length).trim();
210	if (data.length() > 0 ) {
211	current_contents_ += data;
212	}
213	}
214
215	protected String appendPathLink(String path, String qName, Attributes atts)
216	{
217
218	path = path + "/"+qName;
219	if (atts.getLength()>0) {
220	String id = atts.getValue("gs2:id");
221	if (id != null) {
222	path += "[@gs2:id='"+id+"']";
223	}
224	else {
225	id = atts.getValue("gs3:id");
226	if (id != null) {
227	path += "[@gs3:id='"+id+"']";
228	}
229	}
230	}
231	return path;
232	}
233	protected String removePathLink(String path)
234	{
235
236	int i=path.lastIndexOf('/');
237	if (i==-1) {
238	path="";
239	} else {
240	path = path.substring(0, i);
241	}
242	return path;
243	}
244	/** these are what we save on the stack */
245	private class MyDocument
246	{
247	public Document doc = null;
248	public String contents = null;
249	public String tagname = "";
250
251	}
252
253	protected void pushOnStack()
254	{
255	if (current_doc_ != null) {
256	MyDocument save = new MyDocument();
257	save.doc = current_doc_;
258	save.contents = current_contents_;
259	save.tagname = current_node_;
260	stack_.push(save);
261	}
262	current_doc_ = new Document();
263	current_contents_ = "";
264	current_node_ = "";
265	}
266
267	protected void popOffStack()
268	{
269	if (!stack_.empty()) {
270	MyDocument saved = (MyDocument)stack_.pop();
271	current_doc_ = saved.doc;
272	current_contents_ = saved.contents;
273	current_node_ = saved.tagname;
274	} else {
275	current_doc_ = new Document();
276	current_contents_ = "";
277	current_node_ = "";
278	}
279	}
280
281
282	}
283
284

Note: See TracBrowser for help on using the repository browser.

Download in other formats: