source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 16282

Last change on this file since 16282 was 16282, checked in by mdewsnip, 16 years ago

Changed the "nodeID" field to be indexed in an un-tokenized way, because we need this field to be searchable in order to do deletions from the index.

  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1/**********************************************************************
2 *
3 * Indexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import org.xml.sax.Attributes;
30import org.xml.sax.helpers.DefaultHandler;
31import org.xml.sax.InputSource;
32import org.xml.sax.SAXException;
33import org.xml.sax.XMLReader;
34
35import javax.xml.parsers.SAXParser;
36import javax.xml.parsers.SAXParserFactory;
37
38import org.apache.lucene.document.Document;
39import org.apache.lucene.document.Field;
40import org.apache.lucene.index.IndexWriter;
41import org.apache.lucene.analysis.standard.StandardAnalyzer;
42
43import java.util.Stack;
44import java.io.FileInputStream;
45import java.io.File;
46import java.io.StringReader;
47import java.net.URL;
48
49
50
51public class Indexer extends DefaultHandler
52{
53 IndexWriter writer_ = null;
54 SAXParser sax_parser_ = null;
55 String doc_tag_level_ = null;
56
57 Stack stack_ = null;
58 String path_ = "";
59
60 Document current_doc_ = null;
61 String current_node_ = "";
62 String indexable_current_node_ = "";
63 String current_contents_ = "";
64
65 protected String file_id_ = null;
66
67 /** pass in true if want to create a new index, false if want to use the existing one */
68 public Indexer (String doc_tag_level, File index_dir, boolean create)
69 {
70 doc_tag_level_ = doc_tag_level;
71
72 try {
73 stack_ = new Stack();
74 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
75 sax_parser_ = sax_factory.newSAXParser();
76
77 XMLReader reader = sax_parser_.getXMLReader();
78 reader.setFeature("http://xml.org/sax/features/validation", false);
79
80 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
81 // by default, will only index 10,000 words per document
82 // Can throw out_of_memory errors
83 writer_.setMaxFieldLength(Integer.MAX_VALUE);
84 if (create) {
85 writer_.optimize();
86 }
87
88 } catch (Exception e) {
89 // do nothing!
90 }
91 }
92
93 /** index one document */
94 public void index (String file_id, File file)
95 {
96 file_id_ = file_id;
97 path_ = "";
98 String base_path = file.getPath();
99 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
100
101 try {
102 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
103 }
104 catch (Exception e) {
105 println("parse error:");
106 e.printStackTrace();
107 }
108 }
109
110 /** index one document stored as string*/
111 public void index (String xml_text)
112 {
113 file_id_ = "<xml doc on stdin>";
114 path_ = "";
115
116 try {
117 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
118 }
119 catch (Exception e) {
120 println("parse error:");
121 e.printStackTrace();
122 }
123 }
124
125 public void finish()
126 {
127 /** optimise the index */
128 try {
129 writer_.optimize();
130 writer_.close();
131 }
132 catch (Exception e) {
133 }
134 }
135
136 protected void print(String s)
137 {
138 System.out.print(s);
139 }
140
141 protected void println(String s)
142 {
143 System.out.println(s);
144 }
145
146 public void startDocument() throws SAXException
147 {
148 println("Starting to index " + file_id_);
149 print("[");
150 }
151
152 public void endDocument() throws SAXException
153 {
154 println("]");
155 println("... indexing finished.");
156 }
157
158 public void startElement(String uri, String localName, String qName, Attributes atts)
159 throws SAXException
160 {
161 path_ = appendPathLink(path_, qName, atts);
162
163 if (qName.equals(doc_tag_level_)) {
164 pushOnStack(); // start new doc
165 current_node_ = qName;
166 String node_id = atts.getValue("gs2:id");
167
168 print(" " + qName + ": " + node_id );
169 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
170 }
171
172 if (XMLTagInfo.isIndexable(atts)) {
173 indexable_current_node_ = qName;
174 }
175 else {
176 indexable_current_node_ = "";
177 }
178
179 }
180 public void endElement(String uri, String localName, String qName) throws SAXException
181 {
182 if (qName.equals(indexable_current_node_))
183 {
184 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
185 // We only need the term vector for the TX field
186 if (!qName.equals("TX"))
187 {
188 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
189 }
190
191 current_contents_ = "";
192 }
193
194 if (qName.equals(doc_tag_level_)) {
195 try {
196 writer_.addDocument(current_doc_);
197 }
198 catch (java.io.IOException e) {
199 e.printStackTrace();
200 }
201 popOffStack(); // end document
202 }
203
204 path_ = removePathLink(path_);
205 }
206
207 public void characters(char ch[], int start, int length) throws SAXException
208 {
209 String data = new String(ch, start, length).trim();
210 if (data.length() > 0 ) {
211 current_contents_ += data;
212 }
213 }
214
215 protected String appendPathLink(String path, String qName, Attributes atts)
216 {
217
218 path = path + "/"+qName;
219 if (atts.getLength()>0) {
220 String id = atts.getValue("gs2:id");
221 if (id != null) {
222 path += "[@gs2:id='"+id+"']";
223 }
224 else {
225 id = atts.getValue("gs3:id");
226 if (id != null) {
227 path += "[@gs3:id='"+id+"']";
228 }
229 }
230 }
231 return path;
232 }
233 protected String removePathLink(String path)
234 {
235
236 int i=path.lastIndexOf('/');
237 if (i==-1) {
238 path="";
239 } else {
240 path = path.substring(0, i);
241 }
242 return path;
243 }
244 /** these are what we save on the stack */
245 private class MyDocument
246 {
247 public Document doc = null;
248 public String contents = null;
249 public String tagname = "";
250
251 }
252
253 protected void pushOnStack()
254 {
255 if (current_doc_ != null) {
256 MyDocument save = new MyDocument();
257 save.doc = current_doc_;
258 save.contents = current_contents_;
259 save.tagname = current_node_;
260 stack_.push(save);
261 }
262 current_doc_ = new Document();
263 current_contents_ = "";
264 current_node_ = "";
265 }
266
267 protected void popOffStack()
268 {
269 if (!stack_.empty()) {
270 MyDocument saved = (MyDocument)stack_.pop();
271 current_doc_ = saved.doc;
272 current_contents_ = saved.contents;
273 current_node_ = saved.tagname;
274 } else {
275 current_doc_ = new Document();
276 current_contents_ = "";
277 current_node_ = "";
278 }
279 }
280
281
282}
283
284
Note: See TracBrowser for help on using the repository browser.