Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 16282

Last change on this file since 16282 was 16282, checked in by mdewsnip, 16 years ago
Changed the "nodeID" field to be indexed in an un-tokenized way, because we need this field to be searchable in order to do deletions from the index.
Property svn:keywords set to `Author Date Id Revision`
File size: 7.2 KB

Line
1	/**********************************************************************
2	*
3	* Indexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.LuceneWrapper;
27
28
29	import org.xml.sax.Attributes;
30	import org.xml.sax.helpers.DefaultHandler;
31	import org.xml.sax.InputSource;
32	import org.xml.sax.SAXException;
33	import org.xml.sax.XMLReader;
34
35	import javax.xml.parsers.SAXParser;
36	import javax.xml.parsers.SAXParserFactory;
37
38	import org.apache.lucene.document.Document;
39	import org.apache.lucene.document.Field;
40	import org.apache.lucene.index.IndexWriter;
41	import org.apache.lucene.analysis.standard.StandardAnalyzer;
42
43	import java.util.Stack;
44	import java.io.FileInputStream;
45	import java.io.File;
46	import java.io.StringReader;
47	import java.net.URL;
48
49
50
51	public class Indexer extends DefaultHandler
52	{
53	IndexWriter writer_ = null;
54	SAXParser sax_parser_ = null;
55	String doc_tag_level_ = null;
56
57	Stack stack_ = null;
58	String path_ = "";
59
60	Document current_doc_ = null;
61	String current_node_ = "";
62	String indexable_current_node_ = "";
63	String current_contents_ = "";
64
65	protected String file_id_ = null;
66
67	/** pass in true if want to create a new index, false if want to use the existing one */
68	public Indexer (String doc_tag_level, File index_dir, boolean create)
69	{
70	doc_tag_level_ = doc_tag_level;
71
72	try {
73	stack_ = new Stack();
74	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
75	sax_parser_ = sax_factory.newSAXParser();
76
77	XMLReader reader = sax_parser_.getXMLReader();
78	reader.setFeature("http://xml.org/sax/features/validation", false);
79
80	writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
81	// by default, will only index 10,000 words per document
82	// Can throw out_of_memory errors
83	writer_.setMaxFieldLength(Integer.MAX_VALUE);
84	if (create) {
85	writer_.optimize();
86	}
87
88	} catch (Exception e) {
89	// do nothing!
90	}
91	}
92
93	/** index one document */
94	public void index (String file_id, File file)
95	{
96	file_id_ = file_id;
97	path_ = "";
98	String base_path = file.getPath();
99	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
100
101	try {
102	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
103	}
104	catch (Exception e) {
105	println("parse error:");
106	e.printStackTrace();
107	}
108	}
109
110	/** index one document stored as string*/
111	public void index (String xml_text)
112	{
113	file_id_ = "<xml doc on stdin>";
114	path_ = "";
115
116	try {
117	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
118	}
119	catch (Exception e) {
120	println("parse error:");
121	e.printStackTrace();
122	}
123	}
124
125	public void finish()
126	{
127	/** optimise the index */
128	try {
129	writer_.optimize();
130	writer_.close();
131	}
132	catch (Exception e) {
133	}
134	}
135
136	protected void print(String s)
137	{
138	System.out.print(s);
139	}
140
141	protected void println(String s)
142	{
143	System.out.println(s);
144	}
145
146	public void startDocument() throws SAXException
147	{
148	println("Starting to index " + file_id_);
149	print("[");
150	}
151
152	public void endDocument() throws SAXException
153	{
154	println("]");
155	println("... indexing finished.");
156	}
157
158	public void startElement(String uri, String localName, String qName, Attributes atts)
159	throws SAXException
160	{
161	path_ = appendPathLink(path_, qName, atts);
162
163	if (qName.equals(doc_tag_level_)) {
164	pushOnStack(); // start new doc
165	current_node_ = qName;
166	String node_id = atts.getValue("gs2:id");
167
168	print(" " + qName + ": " + node_id );
169	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
170	}
171
172	if (XMLTagInfo.isIndexable(atts)) {
173	indexable_current_node_ = qName;
174	}
175	else {
176	indexable_current_node_ = "";
177	}
178
179	}
180	public void endElement(String uri, String localName, String qName) throws SAXException
181	{
182	if (qName.equals(indexable_current_node_))
183	{
184	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
185	// We only need the term vector for the TX field
186	if (!qName.equals("TX"))
187	{
188	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
189	}
190
191	current_contents_ = "";
192	}
193
194	if (qName.equals(doc_tag_level_)) {
195	try {
196	writer_.addDocument(current_doc_);
197	}
198	catch (java.io.IOException e) {
199	e.printStackTrace();
200	}
201	popOffStack(); // end document
202	}
203
204	path_ = removePathLink(path_);
205	}
206
207	public void characters(char ch[], int start, int length) throws SAXException
208	{
209	String data = new String(ch, start, length).trim();
210	if (data.length() > 0 ) {
211	current_contents_ += data;
212	}
213	}
214
215	protected String appendPathLink(String path, String qName, Attributes atts)
216	{
217
218	path = path + "/"+qName;
219	if (atts.getLength()>0) {
220	String id = atts.getValue("gs2:id");
221	if (id != null) {
222	path += "[@gs2:id='"+id+"']";
223	}
224	else {
225	id = atts.getValue("gs3:id");
226	if (id != null) {
227	path += "[@gs3:id='"+id+"']";
228	}
229	}
230	}
231	return path;
232	}
233	protected String removePathLink(String path)
234	{
235
236	int i=path.lastIndexOf('/');
237	if (i==-1) {
238	path="";
239	} else {
240	path = path.substring(0, i);
241	}
242	return path;
243	}
244	/** these are what we save on the stack */
245	private class MyDocument
246	{
247	public Document doc = null;
248	public String contents = null;
249	public String tagname = "";
250
251	}
252
253	protected void pushOnStack()
254	{
255	if (current_doc_ != null) {
256	MyDocument save = new MyDocument();
257	save.doc = current_doc_;
258	save.contents = current_contents_;
259	save.tagname = current_node_;
260	stack_.push(save);
261	}
262	current_doc_ = new Document();
263	current_contents_ = "";
264	current_node_ = "";
265	}
266
267	protected void popOffStack()
268	{
269	if (!stack_.empty()) {
270	MyDocument saved = (MyDocument)stack_.pop();
271	current_doc_ = saved.doc;
272	current_contents_ = saved.contents;
273	current_node_ = saved.tagname;
274	} else {
275	current_doc_ = new Document();
276	current_contents_ = "";
277	current_node_ = "";
278	}
279	}
280
281
282	}
283
284

Note: See TracBrowser for help on using the repository browser.

Download in other formats: