source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 18464

Last change on this file since 18464 was 18464, checked in by davidb, 15 years ago

Support for reindexing documents

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
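/**
 * Command-line indexer that reads the XML generated by lucenebuildproc.pm
 * from standard input and adds, updates or deletes the corresponding
 * documents in a Lucene index.
 */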
58
59public class GS2LuceneIndexer {
60
61 protected static boolean debug = false;
62
63 protected static void debug(String message)
64 {
65 if (debug) {
66 System.err.println(message);
67 }
68 }
69
70
71 public static void main (String args[]) throws Exception
72 {
73 int verbosity = 1;
74 // Default is to edit the existing index
75 boolean create_new_index = false;
76
77 Vector filtered_args = new Vector();
78
79 int argc = args.length;
80 int i = 0;
81 while (i<argc) {
82 if (args[i].startsWith("-")) {
83
84 // -removeold causes the existing index to be overwritten
85 if (args[i].equals("-removeold")) {
86 create_new_index = true;
87 }
88
89 // -verbosity [num]
90 else if (args[i].equals("-verbosity")) {
91 i++;
92 if (i<argc) {
93 verbosity = Integer.parseInt(args[i]);
94 if (verbosity>=5) {
95 debug = true;
96 }
97 }
98 }
99 else if (args[i].equals("-debug")) {
100 debug = true;
101 }
102 else {
103 System.out.println("Unrecognised option: " + args[i]);
104 }
105 }
106 else {
107 filtered_args.add((Object)args[i]);
108 }
109 i++;
110 }
111
112 if (filtered_args.size() != 3) {
113 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
114 return;
115 }
116
117 String doc_tag_level = (String)filtered_args.get(0);
118 String building_dirname = (String)filtered_args.get(1);
119 String index_dirname = (String)filtered_args.get(2);
120
121 String import_dirname = building_dirname + File.separator + "text";
122
123 File import_dir = new File(import_dirname);
124 File building_dir = new File(building_dirname);
125
126 if (!import_dir.exists()) {
127 System.out.println("Couldn't find import directory: "+import_dirname);
128 return;
129 }
130
131 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132 idx_dir.mkdir();
133
134 // Set up indexer
135 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137 // Read from stdin the files to process
138 try {
139 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140 BufferedReader brin = new BufferedReader(isr);
141
142 StringBuffer xml_text = new StringBuffer(1024);
143 String line = null;
144 while ((line = brin.readLine()) != null) {
145 xml_text.append(line);
146
147 debug("Got line " + line);
148
149 if (line.endsWith("</Delete>")) {
150
151 indexer.delete(xml_text.toString());
152 xml_text = new StringBuffer(1024);
153 }
154 else if (line.startsWith("</Doc>")) {
155 indexer.index(xml_text.toString());
156 xml_text = new StringBuffer(1024);
157 }
158 }
159
160 brin.close();
161 isr.close();
162
163 } catch (IOException e) {
164 System.err.println("Error: unable to read from stdin");
165 e.printStackTrace();
166 }
167
168 indexer.finish();
169 }
170
171
172 static public class Indexer extends DefaultHandler
173 {
174 IndexWriter writer_ = null;
175 Analyzer analyzer_ = null;
176 SAXParser sax_parser_ = null;
177 String doc_tag_level_ = null;
178
179 Stack stack_ = null;
180 String path_ = "";
181
182 Document current_doc_ = null;
183 String current_node_ = "";
184 String current_doc_oid_ = "";
185 String indexable_current_node_ = "";
186 String current_contents_ = "";
187
188 String mode_ = "";
189 protected String file_id_ = null;
190
191 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
192
193
194 /** pass in true if want to create a new index, false if want to use the existing one */
195 public Indexer (String doc_tag_level, File index_dir, boolean create)
196 {
197 doc_tag_level_ = doc_tag_level;
198
199 try {
200 stack_ = new Stack();
201 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
202 sax_parser_ = sax_factory.newSAXParser();
203
204 XMLReader reader = sax_parser_.getXMLReader();
205 reader.setFeature("http://xml.org/sax/features/validation", false);
206
207 analyzer_ = new GS2Analyzer(stop_words);
208
209 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
210 // by default, will only index 10,000 words per document
211 // Can throw out_of_memory errors
212 writer_.setMaxFieldLength(Integer.MAX_VALUE);
213 if (create) {
214 writer_.optimize();
215 }
216 }
217 catch (Exception e) {
218 // We need to know if creating/opening the index fails
219 e.printStackTrace();
220 }
221 }
222
223 /** index one document */
224 public void index (String file_id, File file)
225 {
226 mode_ = "index";
227 file_id_ = file_id;
228 path_ = "";
229 String base_path = file.getPath();
230 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
231
232 try {
233 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
234 }
235 catch (Exception e) {
236 println("parse error:");
237 e.printStackTrace();
238 }
239 }
240
241 /** index one document stored as string*/
242 public void index (String xml_text)
243 {
244 mode_ = "index";
245 file_id_ = "<xml doc on stdin>";
246 path_ = "";
247
248 try {
249 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
250 }
251 catch (Exception e) {
252 println("parse error:");
253 e.printStackTrace();
254 }
255 }
256
257 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
258 public void delete(String xml_text)
259 {
260 mode_ = "delete";
261 file_id_ = "<delete doc>";
262 path_ = "";
263
264 try {
265 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
266 }
267 catch (Exception e) {
268 println("parse error:");
269 e.printStackTrace();
270 }
271 }
272
273 public void finish()
274 {
275 /** optimise the index */
276 try {
277 writer_.optimize();
278 writer_.close();
279 }
280 catch (Exception e) {
281 }
282 }
283
284 protected void print(String s)
285 {
286 System.out.print(s);
287 }
288
289 protected void println(String s)
290 {
291 System.out.println(s);
292 }
293
294 public void startDocument() throws SAXException
295 {
296 println("Starting to process " + file_id_);
297 print("[");
298 }
299
300 public void endDocument() throws SAXException
301 {
302 println("]");
303 println("... processing finished.");
304 }
305
306 public void startElement(String uri, String localName, String qName, Attributes atts)
307 throws SAXException
308 {
309 path_ = appendPathLink(path_, qName, atts);
310
311 if (qName.equals(doc_tag_level_)) {
312 mode_ = atts.getValue("gs2:mode");
313
314 pushOnStack(); // start new doc
315 current_node_ = qName;
316
317 String node_id = atts.getValue("gs2:id");
318 print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
319 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
320
321 current_doc_oid_ = atts.getValue("gs2:docOID");
322 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
323 }
324
325 if (isIndexable(atts)) {
326 indexable_current_node_ = qName;
327 }
328 else {
329 indexable_current_node_ = "";
330 }
331 }
332
333 public static boolean isIndexable(Attributes atts)
334 {
335 boolean is_indexable = false;
336
337 String index = atts.getValue("index");
338 if (index!=null) {
339 if (index.equals("1")) {
340 is_indexable = true;
341 }
342 }
343 return is_indexable;
344 }
345
346 public void endElement(String uri, String localName, String qName) throws SAXException
347 {
348 if (mode_.equals("delete")) {
349 try {
350 deleteDocument(current_doc_oid_);
351 }
352 catch (java.io.IOException e) {
353 e.printStackTrace();
354 }
355 }
356 else if (mode_.equals("index") || mode_.equals("reindex")) {
357 if (qName.equals(indexable_current_node_))
358 {
359 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
360 // The byXX fields are used for sorting search results
361 // We don't want to do that for Text or AllFields fields
362 // They need to be untokenised for sorting
363 if (!qName.equals("TX") && !qName.equals("ZZ"))
364 {
365 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
366 }
367
368 current_contents_ = "";
369 }
370
371 if (qName.equals(doc_tag_level_)) {
372 try {
373 // perhaps this is more efficient if addDocument()
374 // used for "index" and updateDocument() for reindex
375 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
376 }
377 catch (java.io.IOException e) {
378 e.printStackTrace();
379 }
380 popOffStack(); // end document
381 }
382
383 path_ = removePathLink(path_);
384 }
385 }
386
387 public void characters(char ch[], int start, int length) throws SAXException
388 {
389 String data = new String(ch, start, length).trim();
390 if (data.length() > 0 ) {
391 current_contents_ += data;
392 }
393 }
394
395 protected String appendPathLink(String path, String qName, Attributes atts)
396 {
397
398 path = path + "/"+qName;
399 if (atts.getLength()>0) {
400 String id = atts.getValue("gs2:id");
401 if (id != null) {
402 path += "[@gs2:id='"+id+"']";
403 }
404 else {
405 id = atts.getValue("gs3:id");
406 if (id != null) {
407 path += "[@gs3:id='"+id+"']";
408 }
409 }
410 }
411 return path;
412 }
413
414 protected String removePathLink(String path)
415 {
416
417 int i=path.lastIndexOf('/');
418 if (i==-1) {
419 path="";
420 } else {
421 path = path.substring(0, i);
422 }
423 return path;
424 }
425
426
427 /** these are what we save on the stack */
428 private class MyDocument
429 {
430 public Document doc = null;
431 public String contents = null;
432 public String tagname = "";
433
434 }
435
436
437 protected void pushOnStack()
438 {
439 if (current_doc_ != null) {
440 MyDocument save = new MyDocument();
441 save.doc = current_doc_;
442 save.contents = current_contents_;
443 save.tagname = current_node_;
444 stack_.push(save);
445 }
446 current_doc_ = new Document();
447 current_contents_ = "";
448 current_node_ = "";
449 }
450
451 protected void popOffStack()
452 {
453 if (!stack_.empty()) {
454 MyDocument saved = (MyDocument)stack_.pop();
455 current_doc_ = saved.doc;
456 current_contents_ = saved.contents;
457 current_node_ = saved.tagname;
458 } else {
459 current_doc_ = new Document();
460 current_contents_ = "";
461 current_node_ = "";
462 }
463 }
464
465
466 protected void deleteDocument(String doc_id)
467 throws IOException
468 {
469 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
470 debug("- Initial number of documents in index: " + writer_.docCount());
471 writer_.deleteDocuments(new Term("docOID", doc_id));
472 debug("- Final number of documents in index: " + writer_.docCount());
473 }
474
475
476 }
477}
Note: See TracBrowser for help on using the repository browser.