source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 24641

Last change on this file since 24641 was 24641, checked in by davidb, 13 years ago

Initial cut at Greenstone3 runtime code to support Solr. Solr code based on version 3.3, so this also includes an upgraded version of the LuceneWrapper code (gs2build/common-src/indexers/lucene-gs) that works with this version of the support jar files

  • Property svn:executable set to *
File size: 12.1 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import org.apache.lucene.store.SimpleFSDirectory;
49import org.apache.lucene.index.IndexWriter.MaxFieldLength;
50
51import java.util.Stack;
52import java.io.FileInputStream;
53import java.io.File;
54import java.io.StringReader;
55import java.net.URL;
56
57
58/**
59 * class for indexing XML generated by lucenebuildproc.pm
60 */
61
62public class GS2LuceneIndexer {
63
64 protected static boolean debug = false;
65
66 protected static void debug(String message)
67 {
68 if (debug) {
69 System.err.println(message);
70 }
71 }
72
73
74 public static void main (String args[]) throws Exception
75 {
76 int verbosity = 1;
77 // Default is to edit the existing index
78 boolean create_new_index = false;
79
80 Vector filtered_args = new Vector();
81
82 int argc = args.length;
83 int i = 0;
84 while (i<argc) {
85 if (args[i].startsWith("-")) {
86
87 // -removeold causes the existing index to be overwritten
88 if (args[i].equals("-removeold")) {
89 create_new_index = true;
90 }
91
92 // -verbosity [num]
93 else if (args[i].equals("-verbosity")) {
94 i++;
95 if (i<argc) {
96 verbosity = Integer.parseInt(args[i]);
97 if (verbosity>=5) {
98 debug = true;
99 }
100 }
101 }
102 else if (args[i].equals("-debug")) {
103 debug = true;
104 }
105 else {
106 System.err.println("Unrecognised option: " + args[i]);
107 }
108 }
109 else {
110 filtered_args.add((Object)args[i]);
111 }
112 i++;
113 }
114
115 if (filtered_args.size() != 3) {
116 System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
117 return;
118 }
119
120 String doc_tag_level = (String)filtered_args.get(0);
121 String building_dirname = (String)filtered_args.get(1);
122 String index_dirname = (String)filtered_args.get(2);
123
124 String import_dirname = building_dirname + File.separator + "text";
125
126 File import_dir = new File(import_dirname);
127 File building_dir = new File(building_dirname);
128
129 if (!import_dir.exists()) {
130 System.err.println("Couldn't find import directory: "+import_dirname);
131 return;
132 }
133
134 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
135 idx_dir.mkdir();
136
137 // Set up indexer
138 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
139
140 // Read from stdin the files to process
141 try {
142 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
143 BufferedReader brin = new BufferedReader(isr);
144
145 StringBuffer xml_text = new StringBuffer(1024);
146 String line = null;
147 while ((line = brin.readLine()) != null) {
148 xml_text.append(line);
149 xml_text.append(" ");
150
151 debug("Got line " + line);
152
153 if (line.endsWith("</Delete>")) {
154
155 indexer.delete(xml_text.toString());
156 xml_text = new StringBuffer(1024);
157 }
158 else if (line.startsWith("</Doc>")) {
159 indexer.index(xml_text.toString());
160 xml_text = new StringBuffer(1024);
161 }
162 }
163
164 brin.close();
165 isr.close();
166
167 } catch (IOException e) {
168 System.err.println("Error: unable to read from stdin");
169 e.printStackTrace();
170 }
171
172 indexer.finish();
173 }
174
175
176 static public class Indexer extends DefaultHandler
177 {
178 IndexWriter writer_ = null;
179 Analyzer analyzer_ = null;
180 SAXParser sax_parser_ = null;
181 String doc_tag_level_ = null;
182
183 Stack stack_ = null;
184 String path_ = "";
185
186 Document current_doc_ = null;
187 String current_node_ = "";
188 String current_doc_oid_ = "";
189 String indexable_current_node_ = "";
190 String current_contents_ = "";
191
192 String mode_ = "";
193 protected String file_id_ = null;
194
195 /** pass in true if want to create a new index, false if want to use the existing one */
196 public Indexer (String doc_tag_level, File index_dir, boolean create)
197 {
198 doc_tag_level_ = doc_tag_level;
199
200 try {
201 stack_ = new Stack();
202 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203 sax_parser_ = sax_factory.newSAXParser();
204
205 XMLReader reader = sax_parser_.getXMLReader();
206 reader.setFeature("http://xml.org/sax/features/validation", false);
207
208 SimpleFSDirectory index_dir_dir = new SimpleFSDirectory(new File(index_dir.getPath()));
209
210 analyzer_ = new GS2Analyzer(); // uses build in stop_word_set
211
212 writer_ = new IndexWriter(index_dir_dir, analyzer_, create, MaxFieldLength.UNLIMITED);
213
214 // by default, will only index 10,000 words per document
215 // Can throw out_of_memory errors
216 writer_.setMaxFieldLength(Integer.MAX_VALUE);
217 if (create) {
218 writer_.optimize();
219 }
220 }
221 catch (Exception e) {
222 // We need to know if creating/opening the index fails
223 e.printStackTrace();
224 }
225 }
226
227 /** index one document */
228 public void index (String file_id, File file)
229 {
230 mode_ = "add";
231 file_id_ = file_id;
232 path_ = "";
233 String base_path = file.getPath();
234 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
235
236 try {
237 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
238 }
239 catch (Exception e) {
240 println("parse error:");
241 e.printStackTrace();
242 }
243 }
244
245 /** index one document stored as string*/
246 public void index (String xml_text)
247 {
248 mode_ = "add";
249 file_id_ = "<xml doc on stdin>";
250 path_ = "";
251
252 try {
253 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
254 }
255 catch (Exception e) {
256 println("parse error:");
257 e.printStackTrace();
258 }
259 }
260
261 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
262 public void delete(String xml_text)
263 {
264 mode_ = "delete";
265 file_id_ = "<delete doc>";
266 path_ = "";
267
268 try {
269 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
270 }
271 catch (Exception e) {
272 println("parse error:");
273 e.printStackTrace();
274 }
275 }
276
277 public void finish()
278 {
279 /** optimise the index */
280 try {
281 writer_.optimize();
282 writer_.close();
283 }
284 catch (Exception e) {
285 }
286 }
287
288 protected void print(String s)
289 {
290 System.err.print(s);
291 }
292
293 protected void println(String s)
294 {
295 System.err.println(s);
296 }
297
298 public void startDocument() throws SAXException
299 {
300 println("Starting to process " + file_id_);
301 print("[");
302 }
303
304 public void endDocument() throws SAXException
305 {
306 println("]");
307 println("... processing finished.");
308 }
309
310 public void startElement(String uri, String localName, String qName, Attributes atts)
311 throws SAXException
312 {
313 path_ = appendPathLink(path_, qName, atts);
314
315 if (qName.equals(doc_tag_level_)) {
316 mode_ = atts.getValue("gs2:mode");
317
318 pushOnStack(); // start new doc
319 current_node_ = qName;
320
321 //String node_id = atts.getValue("gs2:id");
322 //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
323 //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));
324
325 current_doc_oid_ = atts.getValue("gs2:docOID");
326 print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
327 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
328 }
329
330 if (isIndexable(atts)) {
331 indexable_current_node_ = qName;
332 }
333 else {
334 indexable_current_node_ = "";
335 }
336 }
337
338 public static boolean isIndexable(Attributes atts)
339 {
340 boolean is_indexable = false;
341
342 String index = atts.getValue("index");
343 if (index!=null) {
344 if (index.equals("1")) {
345 is_indexable = true;
346 }
347 }
348 return is_indexable;
349 }
350
351 public void endElement(String uri, String localName, String qName) throws SAXException
352 {
353 if (mode_.equals("delete")) {
354 try {
355 deleteDocument(current_doc_oid_);
356 }
357 catch (java.io.IOException e) {
358 e.printStackTrace();
359 }
360 }
361 else if (mode_.equals("add") || mode_.equals("update")) {
362 if (qName.equals(indexable_current_node_))
363 {
364 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
365 // The byXX fields are used for sorting search results
366 // We don't want to do that for Text or AllFields fields
367 // They need to be untokenised for sorting
368 if (!qName.equals("TX") && !qName.equals("ZZ"))
369 {
370 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
371 }
372
373 current_contents_ = "";
374 }
375
376 if (qName.equals(doc_tag_level_)) {
377 try {
378 // perhaps this is more efficient if addDocument()
379 // used for "add" and updateDocument() for "update"
380 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
381 }
382 catch (java.io.IOException e) {
383 e.printStackTrace();
384 }
385 popOffStack(); // end document
386 }
387
388 path_ = removePathLink(path_);
389 }
390 }
391
392 public void characters(char ch[], int start, int length) throws SAXException
393 {
394 String data = new String(ch, start, length).trim();
395 if (data.length() > 0 ) {
396 current_contents_ += data;
397 }
398 }
399
400 protected String appendPathLink(String path, String qName, Attributes atts)
401 {
402
403 path = path + "/"+qName;
404 if (atts.getLength()>0) {
405 // was gs2:id, changed to gs2:docOID --kjdon
406 String id = atts.getValue("gs2:docOID");
407 if (id != null) {
408 path += "[@gs2:docOID='"+id+"']";
409 }
410 else {
411 // is this ever used? not in perl currently
412 id = atts.getValue("gs3:id");
413 if (id != null) {
414 path += "[@gs3:id='"+id+"']";
415 }
416 }
417 }
418 return path;
419 }
420
421 protected String removePathLink(String path)
422 {
423
424 int i=path.lastIndexOf('/');
425 if (i==-1) {
426 path="";
427 } else {
428 path = path.substring(0, i);
429 }
430 return path;
431 }
432
433
434 /** these are what we save on the stack */
435 private class MyDocument
436 {
437 public Document doc = null;
438 public String contents = null;
439 public String tagname = "";
440
441 }
442
443
444 protected void pushOnStack()
445 {
446 if (current_doc_ != null) {
447 MyDocument save = new MyDocument();
448 save.doc = current_doc_;
449 save.contents = current_contents_;
450 save.tagname = current_node_;
451 stack_.push(save);
452 }
453 current_doc_ = new Document();
454 current_contents_ = "";
455 current_node_ = "";
456 }
457
458 protected void popOffStack()
459 {
460 if (!stack_.empty()) {
461 MyDocument saved = (MyDocument)stack_.pop();
462 current_doc_ = saved.doc;
463 current_contents_ = saved.contents;
464 current_node_ = saved.tagname;
465 } else {
466 current_doc_ = new Document();
467 current_contents_ = "";
468 current_node_ = "";
469 }
470 }
471
472
473 protected void deleteDocument(String doc_id)
474 throws IOException
475 {
476 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
477 debug("- Initial number of documents in index: " + writer_.numDocs());
478 writer_.deleteDocuments(new Term("docOID", doc_id));
479 debug("- Final number of documents in index: " + writer_.numDocs());
480 }
481
482
483 }
484}
Note: See TracBrowser for help on using the repository browser.