source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16438

Last change on this file since 16438 was 16438, checked in by mdewsnip, 16 years ago

Now uses Lucene 2.3.2's updateDocument() function instead of addDocument(), in preparation for implementing incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
RevLine 
[13555]1/**********************************************************************
[8521]2 *
[13555]3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
[8521]26
[13686]27package org.greenstone.LuceneWrapper;
[8521]28
[12257]29
[8521]30import java.io.*;
[10164]31import java.util.Vector;
[8521]32
[16432]33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
[16438]45import org.apache.lucene.index.Term;
[16432]46import org.apache.lucene.analysis.standard.StandardAnalyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
[13555]55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
[8521]59public class GS2LuceneIndexer {
60
[10164]61 public static void main (String args[]) throws Exception
62 {
63
64 int verbosity = 1;
[16262]65 // Default is to edit the existing index
66 boolean create_new_index = false;
[10164]67
68 Vector filtered_args = new Vector();
69
70 int argc = args.length;
71 int i = 0;
72 while (i<argc) {
73 if (args[i].startsWith("-")) {
74
[16262]75 // -removeold causes the existing index to be overwritten
76 if (args[i].equals("-removeold")) {
77 create_new_index = true;
[10164]78 }
79
80 // -verbosity [num]
81 else if (args[i].equals("-verbosity")) {
82 i++;
83 if (i<argc) {
84 verbosity = Integer.parseInt(args[i]);
85 }
86 }
87 else {
88 System.out.println("Unrecognised option: " + args[i]);
89 }
90 }
91 else {
92 filtered_args.add((Object)args[i]);
93 }
94 i++;
95 }
96
97 if (filtered_args.size() != 3) {
[16262]98 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
[8521]99 return;
100 }
101
[10164]102 String doc_tag_level = (String)filtered_args.get(0);
103 String building_dirname = (String)filtered_args.get(1);
104 String index_dirname = (String)filtered_args.get(2);
[8521]105
106 String import_dirname = building_dirname + File.separator + "text";
107
108 File import_dir = new File(import_dirname);
109 File building_dir = new File(building_dirname);
110
111 if (!import_dir.exists()) {
112 System.out.println("Couldn't find import directory: "+import_dirname);
113 return;
114 }
115
116 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
117 idx_dir.mkdir();
118
119 // Set up indexer
[16430]120 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
[8521]121
122 // Read from stdin the files to process
123 try {
[9988]124 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
[8521]125 BufferedReader brin = new BufferedReader(isr);
126
127 StringBuffer xml_text = new StringBuffer(1024);
128 String line = null;
129 while ((line = brin.readLine()) != null) {
130 xml_text.append(line);
131 if (line.startsWith("</Doc>")) {
[16430]132 indexer.index(xml_text.toString());
[8521]133 xml_text = new StringBuffer(1024);
134 }
135 }
136
137 brin.close();
138 isr.close();
139
140 } catch (IOException e) {
141 System.err.println("Error: unable to read from stdin");
142 e.printStackTrace();
143 }
144
145 indexer.finish();
146 }
[16432]147
148
149 static public class Indexer extends DefaultHandler
150 {
151 IndexWriter writer_ = null;
152 SAXParser sax_parser_ = null;
153 String doc_tag_level_ = null;
154
155 Stack stack_ = null;
156 String path_ = "";
157
158 Document current_doc_ = null;
159 String current_node_ = "";
[16438]160 String current_doc_oid_ = "";
[16432]161 String indexable_current_node_ = "";
162 String current_contents_ = "";
163
164 protected String file_id_ = null;
165
166 /** pass in true if want to create a new index, false if want to use the existing one */
167 public Indexer (String doc_tag_level, File index_dir, boolean create)
168 {
169 doc_tag_level_ = doc_tag_level;
170
171 try {
172 stack_ = new Stack();
173 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
174 sax_parser_ = sax_factory.newSAXParser();
175
176 XMLReader reader = sax_parser_.getXMLReader();
177 reader.setFeature("http://xml.org/sax/features/validation", false);
178
179 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
180 // by default, will only index 10,000 words per document
181 // Can throw out_of_memory errors
182 writer_.setMaxFieldLength(Integer.MAX_VALUE);
183 if (create) {
184 writer_.optimize();
185 }
186
187 } catch (Exception e) {
188 // do nothing!
189 }
190 }
191
192 /** index one document */
193 public void index (String file_id, File file)
194 {
195 file_id_ = file_id;
196 path_ = "";
197 String base_path = file.getPath();
198 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
199
200 try {
201 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
202 }
203 catch (Exception e) {
204 println("parse error:");
205 e.printStackTrace();
206 }
207 }
208
209 /** index one document stored as string*/
210 public void index (String xml_text)
211 {
212 file_id_ = "<xml doc on stdin>";
213 path_ = "";
214
215 try {
216 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
217 }
218 catch (Exception e) {
219 println("parse error:");
220 e.printStackTrace();
221 }
222 }
223
224 public void finish()
225 {
226 /** optimise the index */
227 try {
228 writer_.optimize();
229 writer_.close();
230 }
231 catch (Exception e) {
232 }
233 }
234
235 protected void print(String s)
236 {
237 System.out.print(s);
238 }
239
240 protected void println(String s)
241 {
242 System.out.println(s);
243 }
244
245 public void startDocument() throws SAXException
246 {
247 println("Starting to index " + file_id_);
248 print("[");
249 }
250
251 public void endDocument() throws SAXException
252 {
253 println("]");
254 println("... indexing finished.");
255 }
256
257 public void startElement(String uri, String localName, String qName, Attributes atts)
258 throws SAXException
259 {
260 path_ = appendPathLink(path_, qName, atts);
261
262 if (qName.equals(doc_tag_level_)) {
263 pushOnStack(); // start new doc
264 current_node_ = qName;
[16437]265
[16432]266 String node_id = atts.getValue("gs2:id");
267 print(" " + qName + ": " + node_id );
268 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
[16437]269
[16438]270 current_doc_oid_ = atts.getValue("gs2:docOID");
[16437]271 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
[16432]272 }
273
274 if (XMLTagInfo.isIndexable(atts)) {
275 indexable_current_node_ = qName;
276 }
277 else {
278 indexable_current_node_ = "";
279 }
280
281 }
282
283 public void endElement(String uri, String localName, String qName) throws SAXException
284 {
285 if (qName.equals(indexable_current_node_))
286 {
287 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
288 // We only need the term vector for the TX field
289 if (!qName.equals("TX"))
290 {
291 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
292 }
293
294 current_contents_ = "";
295 }
296
297 if (qName.equals(doc_tag_level_)) {
298 try {
[16438]299 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
[16432]300 }
301 catch (java.io.IOException e) {
302 e.printStackTrace();
303 }
304 popOffStack(); // end document
305 }
306
307 path_ = removePathLink(path_);
308 }
309
310 public void characters(char ch[], int start, int length) throws SAXException
311 {
312 String data = new String(ch, start, length).trim();
313 if (data.length() > 0 ) {
314 current_contents_ += data;
315 }
316 }
317
318 protected String appendPathLink(String path, String qName, Attributes atts)
319 {
320
321 path = path + "/"+qName;
322 if (atts.getLength()>0) {
323 String id = atts.getValue("gs2:id");
324 if (id != null) {
325 path += "[@gs2:id='"+id+"']";
326 }
327 else {
328 id = atts.getValue("gs3:id");
329 if (id != null) {
330 path += "[@gs3:id='"+id+"']";
331 }
332 }
333 }
334 return path;
335 }
336
337 protected String removePathLink(String path)
338 {
339
340 int i=path.lastIndexOf('/');
341 if (i==-1) {
342 path="";
343 } else {
344 path = path.substring(0, i);
345 }
346 return path;
347 }
348
349
350 /** these are what we save on the stack */
351 private class MyDocument
352 {
353 public Document doc = null;
354 public String contents = null;
355 public String tagname = "";
356
357 }
358
359
360 protected void pushOnStack()
361 {
362 if (current_doc_ != null) {
363 MyDocument save = new MyDocument();
364 save.doc = current_doc_;
365 save.contents = current_contents_;
366 save.tagname = current_node_;
367 stack_.push(save);
368 }
369 current_doc_ = new Document();
370 current_contents_ = "";
371 current_node_ = "";
372 }
373
374 protected void popOffStack()
375 {
376 if (!stack_.empty()) {
377 MyDocument saved = (MyDocument)stack_.pop();
378 current_doc_ = saved.doc;
379 current_contents_ = saved.contents;
380 current_node_ = saved.tagname;
381 } else {
382 current_doc_ = new Document();
383 current_contents_ = "";
384 current_node_ = "";
385 }
386 }
387 }
[8521]388}
Note: See TracBrowser for help on using the repository browser.