source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 18444

Last change on this file since 18444 was 18444, checked in by davidb, 15 years ago

Modifications for incremental building to support files that need to be deleted

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 protected static boolean debug = false;
62
63 protected static void debug(String message)
64 {
65 if (debug) {
66 System.err.println(message);
67 }
68 }
69
70
71 public static void main (String args[]) throws Exception
72 {
73 int verbosity = 1;
74 // Default is to edit the existing index
75 boolean create_new_index = false;
76
77 Vector filtered_args = new Vector();
78
79 int argc = args.length;
80 int i = 0;
81 while (i<argc) {
82 if (args[i].startsWith("-")) {
83
84 // -removeold causes the existing index to be overwritten
85 if (args[i].equals("-removeold")) {
86 create_new_index = true;
87 }
88
89 // -verbosity [num]
90 else if (args[i].equals("-verbosity")) {
91 i++;
92 if (i<argc) {
93 verbosity = Integer.parseInt(args[i]);
94 if (verbosity>=5) {
95 debug = true;
96 }
97 }
98 }
99 else if (args[i].equals("-debug")) {
100 debug = true;
101 }
102 else {
103 System.out.println("Unrecognised option: " + args[i]);
104 }
105 }
106 else {
107 filtered_args.add((Object)args[i]);
108 }
109 i++;
110 }
111
112 if (filtered_args.size() != 3) {
113 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
114 return;
115 }
116
117 String doc_tag_level = (String)filtered_args.get(0);
118 String building_dirname = (String)filtered_args.get(1);
119 String index_dirname = (String)filtered_args.get(2);
120
121 String import_dirname = building_dirname + File.separator + "text";
122
123 File import_dir = new File(import_dirname);
124 File building_dir = new File(building_dirname);
125
126 if (!import_dir.exists()) {
127 System.out.println("Couldn't find import directory: "+import_dirname);
128 return;
129 }
130
131 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132 idx_dir.mkdir();
133
134 // Set up indexer
135 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137 // Read from stdin the files to process
138 try {
139 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140 BufferedReader brin = new BufferedReader(isr);
141
142 StringBuffer xml_text = new StringBuffer(1024);
143 String line = null;
144 while ((line = brin.readLine()) != null) {
145 xml_text.append(line);
146
147 debug("Got line " + line);
148
149 if (line.endsWith("</Delete>")) {
150
151 indexer.delete(xml_text.toString());
152 xml_text = new StringBuffer(1024);
153 }
154 else if (line.startsWith("</Doc>")) {
155 indexer.index(xml_text.toString());
156 xml_text = new StringBuffer(1024);
157 }
158 }
159
160 brin.close();
161 isr.close();
162
163 } catch (IOException e) {
164 System.err.println("Error: unable to read from stdin");
165 e.printStackTrace();
166 }
167
168 indexer.finish();
169 }
170
171
172 static public class Indexer extends DefaultHandler
173 {
174 IndexWriter writer_ = null;
175 Analyzer analyzer_ = null;
176 SAXParser sax_parser_ = null;
177 String doc_tag_level_ = null;
178
179 Stack stack_ = null;
180 String path_ = "";
181
182 Document current_doc_ = null;
183 String current_node_ = "";
184 String current_doc_oid_ = "";
185 String indexable_current_node_ = "";
186 String current_contents_ = "";
187
188 String mode_ = "";
189 protected String file_id_ = null;
190
191 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
192
193
194 /** pass in true if want to create a new index, false if want to use the existing one */
195 public Indexer (String doc_tag_level, File index_dir, boolean create)
196 {
197 doc_tag_level_ = doc_tag_level;
198
199 try {
200 stack_ = new Stack();
201 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
202 sax_parser_ = sax_factory.newSAXParser();
203
204 XMLReader reader = sax_parser_.getXMLReader();
205 reader.setFeature("http://xml.org/sax/features/validation", false);
206
207 analyzer_ = new GS2Analyzer(stop_words);
208
209 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
210 // by default, will only index 10,000 words per document
211 // Can throw out_of_memory errors
212 writer_.setMaxFieldLength(Integer.MAX_VALUE);
213 if (create) {
214 writer_.optimize();
215 }
216 }
217 catch (Exception e) {
218 // We need to know if creating/opening the index fails
219 e.printStackTrace();
220 }
221 }
222
223 /** index one document */
224 public void index (String file_id, File file)
225 {
226 mode_ = "index";
227 file_id_ = file_id;
228 path_ = "";
229 String base_path = file.getPath();
230 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
231
232 try {
233 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
234 }
235 catch (Exception e) {
236 println("parse error:");
237 e.printStackTrace();
238 }
239 }
240
241 /** index one document stored as string*/
242 public void index (String xml_text)
243 {
244 mode_ = "index";
245 file_id_ = "<xml doc on stdin>";
246 path_ = "";
247
248 try {
249 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
250 }
251 catch (Exception e) {
252 println("parse error:");
253 e.printStackTrace();
254 }
255 }
256
257 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
258 public void delete(String xml_text)
259 {
260 mode_ = "delete";
261 file_id_ = "<delete doc>";
262 path_ = "";
263
264 try {
265 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
266 }
267 catch (Exception e) {
268 println("parse error:");
269 e.printStackTrace();
270 }
271 }
272
273 public void finish()
274 {
275 /** optimise the index */
276 try {
277 writer_.optimize();
278 writer_.close();
279 }
280 catch (Exception e) {
281 }
282 }
283
284 protected void print(String s)
285 {
286 System.out.print(s);
287 }
288
289 protected void println(String s)
290 {
291 System.out.println(s);
292 }
293
294 public void startDocument() throws SAXException
295 {
296 if (mode_.equals("index")) {
297 println("Starting to index " + file_id_);
298 print("[");
299 }
300 }
301
302 public void endDocument() throws SAXException
303 {
304 if (mode_.equals("index")) {
305 println("]");
306 println("... indexing finished.");
307 }
308 }
309
310 public void startElement(String uri, String localName, String qName, Attributes atts)
311 throws SAXException
312 {
313 if (mode_.equals("index")) {
314 path_ = appendPathLink(path_, qName, atts);
315
316 if (qName.equals(doc_tag_level_)) {
317 pushOnStack(); // start new doc
318 current_node_ = qName;
319
320 String node_id = atts.getValue("gs2:id");
321 print(" " + qName + ": " + node_id );
322 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
323
324 current_doc_oid_ = atts.getValue("gs2:docOID");
325 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
326 }
327
328 if (isIndexable(atts)) {
329 indexable_current_node_ = qName;
330 }
331 else {
332 indexable_current_node_ = "";
333 }
334 }
335 }
336
337 public static boolean isIndexable(Attributes atts)
338 {
339 boolean is_indexable = false;
340
341 String index = atts.getValue("index");
342 if (index!=null) {
343 if (index.equals("1")) {
344 is_indexable = true;
345 }
346 }
347 return is_indexable;
348 }
349
350 public void endElement(String uri, String localName, String qName) throws SAXException
351 {
352 if (mode_.equals("index")) {
353 if (qName.equals(indexable_current_node_))
354 {
355 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
356 // The byXX fields are used for sorting search results
357 // We don't want to do that for Text or AllFields fields
358 // They need to be untokenised for sorting
359 if (!qName.equals("TX") && !qName.equals("ZZ"))
360 {
361 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
362 }
363
364 current_contents_ = "";
365 }
366
367 if (qName.equals(doc_tag_level_)) {
368 try {
369 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
370 }
371 catch (java.io.IOException e) {
372 e.printStackTrace();
373 }
374 popOffStack(); // end document
375 }
376
377 path_ = removePathLink(path_);
378 }
379 else if (mode_.equals("delete")) {
380 if (qName.equals("Delete")) {
381 try {
382 deleteDocument(current_contents_);
383 current_contents_ = "";
384 }
385 catch (java.io.IOException e) {
386 e.printStackTrace();
387 }
388 }
389 }
390 }
391
392 public void characters(char ch[], int start, int length) throws SAXException
393 {
394 String data = new String(ch, start, length).trim();
395 if (data.length() > 0 ) {
396 current_contents_ += data;
397 }
398 }
399
400 protected String appendPathLink(String path, String qName, Attributes atts)
401 {
402
403 path = path + "/"+qName;
404 if (atts.getLength()>0) {
405 String id = atts.getValue("gs2:id");
406 if (id != null) {
407 path += "[@gs2:id='"+id+"']";
408 }
409 else {
410 id = atts.getValue("gs3:id");
411 if (id != null) {
412 path += "[@gs3:id='"+id+"']";
413 }
414 }
415 }
416 return path;
417 }
418
419 protected String removePathLink(String path)
420 {
421
422 int i=path.lastIndexOf('/');
423 if (i==-1) {
424 path="";
425 } else {
426 path = path.substring(0, i);
427 }
428 return path;
429 }
430
431
432 /** these are what we save on the stack */
433 private class MyDocument
434 {
435 public Document doc = null;
436 public String contents = null;
437 public String tagname = "";
438
439 }
440
441
442 protected void pushOnStack()
443 {
444 if (current_doc_ != null) {
445 MyDocument save = new MyDocument();
446 save.doc = current_doc_;
447 save.contents = current_contents_;
448 save.tagname = current_node_;
449 stack_.push(save);
450 }
451 current_doc_ = new Document();
452 current_contents_ = "";
453 current_node_ = "";
454 }
455
456 protected void popOffStack()
457 {
458 if (!stack_.empty()) {
459 MyDocument saved = (MyDocument)stack_.pop();
460 current_doc_ = saved.doc;
461 current_contents_ = saved.contents;
462 current_node_ = saved.tagname;
463 } else {
464 current_doc_ = new Document();
465 current_contents_ = "";
466 current_node_ = "";
467 }
468 }
469
470
471 protected void deleteDocument(String doc_id)
472 throws IOException
473 {
474 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
475 debug("- Initial number of documents in index: " + writer_.docCount());
476 writer_.deleteDocuments(new Term("docOID", doc_id));
477 debug("- Final number of documents in index: " + writer_.docCount());
478 }
479
480
481 }
482}
Note: See TracBrowser for help on using the repository browser.