source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneIndexer.java@ 27359

Last change on this file since 27359 was 27359, checked in by kjdon, 11 years ago

sort fields are now separate from index fields. index fields will be like <TI index=1> and sort fields will be like <byTI index=1 tokenize=0>

  • Property svn:executable set to *
File size: 12.6 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper3;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import org.apache.lucene.store.SimpleFSDirectory;
49import org.apache.lucene.index.IndexWriter.MaxFieldLength;
50
51import java.util.Stack;
52import java.io.FileInputStream;
53import java.io.File;
54import java.io.StringReader;
55import java.net.URL;
56
57
58/**
59 * class for indexing XML generated by lucenebuildproc.pm
60 */
61
62public class GS2LuceneIndexer {
63
64 protected static boolean debug = false;
65
66 protected static void debug(String message)
67 {
68 if (debug) {
69 System.err.println(message);
70 }
71 }
72
73
74 public static void main (String args[]) throws Exception
75 {
76 int verbosity = 1;
77 // Default is to edit the existing index
78 boolean create_new_index = false;
79
80 Vector filtered_args = new Vector();
81
82 int argc = args.length;
83 int i = 0;
84 while (i<argc) {
85 if (args[i].startsWith("-")) {
86
87 // -removeold causes the existing index to be overwritten
88 if (args[i].equals("-removeold")) {
89 create_new_index = true;
90 }
91
92 // -verbosity [num]
93 else if (args[i].equals("-verbosity")) {
94 i++;
95 if (i<argc) {
96 verbosity = Integer.parseInt(args[i]);
97 if (verbosity>=5) {
98 debug = true;
99 }
100 }
101 }
102 else if (args[i].equals("-debug")) {
103 debug = true;
104 }
105 else {
106 System.err.println("Unrecognised option: " + args[i]);
107 }
108 }
109 else {
110 filtered_args.add((Object)args[i]);
111 }
112 i++;
113 }
114
115 if (filtered_args.size() != 3) {
116 System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
117 return;
118 }
119
120 String doc_tag_level = (String)filtered_args.get(0);
121 String building_dirname = (String)filtered_args.get(1);
122 String index_dirname = (String)filtered_args.get(2);
123
124 String import_dirname = building_dirname + File.separator + "text";
125
126 File import_dir = new File(import_dirname);
127 File building_dir = new File(building_dirname);
128
129 if (!import_dir.exists()) {
130 System.err.println("Couldn't find import directory: "+import_dirname);
131 return;
132 }
133
134 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
135 idx_dir.mkdir();
136
137 // Set up indexer
138 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
139
140 // Read from stdin the files to process
141 try {
142 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
143 BufferedReader brin = new BufferedReader(isr);
144
145 StringBuffer xml_text = new StringBuffer(1024);
146 String line = null;
147 while ((line = brin.readLine()) != null) {
148 xml_text.append(line);
149 xml_text.append(" ");
150
151 debug("Got line " + line);
152
153 if (line.endsWith("</Delete>")) {
154
155 indexer.delete(xml_text.toString());
156 xml_text = new StringBuffer(1024);
157 }
158 else if (line.startsWith("</Doc>")) {
159 indexer.index(xml_text.toString());
160 xml_text = new StringBuffer(1024);
161 }
162 }
163
164 brin.close();
165 isr.close();
166
167 } catch (IOException e) {
168 System.err.println("Error: unable to read from stdin");
169 e.printStackTrace();
170 }
171
172 indexer.finish();
173 }
174
175
176 static public class Indexer extends DefaultHandler
177 {
        // Lucene writer for the index being built/updated (null if the ctor failed)
        IndexWriter writer_ = null;
        // analyzer shared between the writer and updateDocument() calls
        Analyzer analyzer_ = null;
        // reusable SAX parser; each index()/delete() call re-parses with 'this' as handler
        SAXParser sax_parser_ = null;
        // element name that delimits one Lucene document (e.g. "Doc")
        String doc_tag_level_ = null;

        // saved per-document state (MyDocument entries) for nested document elements
        Stack stack_ = null;
        // XPath-like location of the current element within the XML stream
        String path_ = "";

        // Lucene document currently being populated
        Document current_doc_ = null;
        // qName of the element that opened current_doc_
        String current_node_ = "";
        // gs2:docOID of the current document (also the key used by delete mode)
        String current_doc_oid_ = "";
        // qName of the innermost element marked index="1", or "" if none
        String indexable_current_node_ = "";
        // whether the current indexable field's text should be analyzed (tokenize != "0")
        boolean tokenize = true;
        // character data accumulated for the field currently being read
        String current_contents_ = "";

        // "add", "update" or "delete" -- set per call, overridden by gs2:mode attribute
        String mode_ = "";
        // identifier of the input being processed, used in progress messages
        protected String file_id_ = null;

        /** pass in true if want to create a new index, false if want to use the existing one */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                // turn off DTD validation -- the input is trusted build output
                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                SimpleFSDirectory index_dir_dir = new SimpleFSDirectory(new File(index_dir.getPath()));

                analyzer_ = new GS2Analyzer(); // uses built-in stop-word set

                // create==true wipes any existing index at this location
                writer_ = new IndexWriter(index_dir_dir, analyzer_, create, MaxFieldLength.UNLIMITED);

                // by default, will only index 10,000 words per document
                // Can throw out_of_memory errors
                writer_.setMaxFieldLength(Integer.MAX_VALUE);
                if (create) {
                    // NOTE(review): optimising a freshly created (empty) index looks
                    // redundant -- presumably harmless; confirm the intent
                    writer_.optimize();
                }
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }
227
228 /** index one document */
229 public void index (String file_id, File file)
230 {
231 mode_ = "add";
232 file_id_ = file_id;
233 path_ = "";
234 String base_path = file.getPath();
235 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
236
237 try {
238 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
239 }
240 catch (Exception e) {
241 println("parse error:");
242 e.printStackTrace();
243 }
244 }
245
246 /** index one document stored as string*/
247 public void index (String xml_text)
248 {
249 mode_ = "add";
250 file_id_ = "<xml doc on stdin>";
251 path_ = "";
252
253 try {
254 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
255 }
256 catch (Exception e) {
257 println("parse error:");
258 e.printStackTrace();
259 }
260 }
261
262 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
263 public void delete(String xml_text)
264 {
265 mode_ = "delete";
266 file_id_ = "<delete doc>";
267 path_ = "";
268
269 try {
270 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
271 }
272 catch (Exception e) {
273 println("parse error:");
274 e.printStackTrace();
275 }
276 }
277
278 public void finish()
279 {
280 /** optimise the index */
281 try {
282 writer_.optimize();
283 writer_.close();
284 }
285 catch (Exception e) {
286 }
287 }
288
        /** Write s to stderr with no trailing newline. */
        protected void print(String s)
        {
            System.err.print(s);
        }

        /** Write s to stderr followed by a newline. */
        protected void println(String s)
        {
            System.err.println(s);
        }
298
        /** SAX callback: announce the start of processing for file_id_.
         *  The "[" opens a progress bracket that endDocument() closes. */
        public void startDocument() throws SAXException
        {
            println("Starting to process " + file_id_);
            print("[");
        }

        /** SAX callback: close the progress bracket and report completion. */
        public void endDocument() throws SAXException
        {
            println("]");
            println("... processing finished.");
        }
310
        /** SAX callback: start of an element.
         *  Opens a new Lucene document when the element is at doc_tag_level_,
         *  and records whether this element's text is to be indexed and
         *  whether it should be tokenized. */
        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                // per-document gs2:mode overrides the mode set by index()/delete()
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start new doc
                current_node_ = qName;

                //String node_id = atts.getValue("gs2:id");
                //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
                //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
                // docOID is stored un-analyzed: it is the key used later by
                // updateDocument() and deleteDocuments()
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
                if (isTokenized(atts)) {
                    tokenize = true;
                } else {
                    tokenize = false;
                }
            }
            else {
                // not an index="1" element; endElement() will add no field for it
                indexable_current_node_ = "";
            }
        }
343
344 public static boolean isTokenized(Attributes atts) {
345 boolean tokenize = true;
346 String tok = atts.getValue("tokenize");
347 if (tok!=null && tok.equals("0")) {
348 tokenize = false;
349 }
350 return tokenize;
351 }
352
353 public static boolean isIndexable(Attributes atts)
354 {
355 boolean is_indexable = false;
356
357 String index = atts.getValue("index");
358 if (index!=null) {
359 if (index.equals("1")) {
360 is_indexable = true;
361 }
362 }
363 return is_indexable;
364 }
365
        /** SAX callback: end of an element.
         *  In delete mode, removes the document keyed by current_doc_oid_.
         *  In add/update mode, flushes accumulated text into current_doc_ as
         *  a field, and at the doc_tag_level_ element writes the document to
         *  the index.
         *
         *  NOTE(review): in delete mode deleteDocument() fires for EVERY
         *  closing tag, and path_ is only popped inside the add/update
         *  branch -- presumably acceptable for the simple Delete input
         *  produced by lucenebuildproc.pm, but worth confirming. */
        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_))
                {
                    if (tokenize) {
                        // tokenized fields keep term vectors for ranked searching
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
                    } else {
                        // untokenized (e.g. byXX sort fields) are stored as a single term
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    }
                    // // The byXX fields are used for sorting search results
                    // // We don't want to do that for Text or AllFields fields
                    // // They need to be untokenised for sorting
                    // if (!qName.equals("TX") && !qName.equals("ZZ"))
                    // {
                    //     current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    // }

                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // perhaps this is more efficient if addDocument()
                        // used for "add" and updateDocument() for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end document
                }

                path_ = removePathLink(path_);
            }
        }
410
411 public void characters(char ch[], int start, int length) throws SAXException
412 {
413 String data = new String(ch, start, length).trim();
414 if (data.length() > 0 ) {
415 current_contents_ += data;
416 }
417 }
418
419 protected String appendPathLink(String path, String qName, Attributes atts)
420 {
421
422 path = path + "/"+qName;
423 if (atts.getLength()>0) {
424 // was gs2:id, changed to gs2:docOID --kjdon
425 String id = atts.getValue("gs2:docOID");
426 if (id != null) {
427 path += "[@gs2:docOID='"+id+"']";
428 }
429 else {
430 // is this ever used? not in perl currently
431 id = atts.getValue("gs3:id");
432 if (id != null) {
433 path += "[@gs3:id='"+id+"']";
434 }
435 }
436 }
437 return path;
438 }
439
440 protected String removePathLink(String path)
441 {
442
443 int i=path.lastIndexOf('/');
444 if (i==-1) {
445 path="";
446 } else {
447 path = path.substring(0, i);
448 }
449 return path;
450 }
451
452
        /** these are what we save on the stack: a snapshot of the
         *  per-document parsing state, pushed when a nested document
         *  element begins and restored when it ends */
        private class MyDocument
        {
            // Lucene document under construction
            public Document doc = null;
            // field text accumulated so far
            public String contents = null;
            // qName of the element that opened this document
            public String tagname = "";

        }
461
462
463 protected void pushOnStack()
464 {
465 if (current_doc_ != null) {
466 MyDocument save = new MyDocument();
467 save.doc = current_doc_;
468 save.contents = current_contents_;
469 save.tagname = current_node_;
470 stack_.push(save);
471 }
472 current_doc_ = new Document();
473 current_contents_ = "";
474 current_node_ = "";
475 }
476
477 protected void popOffStack()
478 {
479 if (!stack_.empty()) {
480 MyDocument saved = (MyDocument)stack_.pop();
481 current_doc_ = saved.doc;
482 current_contents_ = saved.contents;
483 current_node_ = saved.tagname;
484 } else {
485 current_doc_ = new Document();
486 current_contents_ = "";
487 current_node_ = "";
488 }
489 }
490
491
492 protected void deleteDocument(String doc_id)
493 throws IOException
494 {
495 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
496 debug("- Initial number of documents in index: " + writer_.numDocs());
497 writer_.deleteDocuments(new Term("docOID", doc_id));
498 debug("- Final number of documents in index: " + writer_.numDocs());
499 }
500
501
502 }
503}
Note: See TracBrowser for help on using the repository browser.