source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 19861

Last change on this file since 19861 was 19861, checked in by kjdon, 15 years ago

Fixed a bug where the last word on each line was not searchable. When reading in the lines, the indexer concatenated them without any space in between, so the last word of one line was getting stuck to the first word of the next. A space is now added between each line of text.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.7 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 protected static boolean debug = false;
62
63 protected static void debug(String message)
64 {
65 if (debug) {
66 System.err.println(message);
67 }
68 }
69
70
71 public static void main (String args[]) throws Exception
72 {
73 int verbosity = 1;
74 // Default is to edit the existing index
75 boolean create_new_index = false;
76
77 Vector filtered_args = new Vector();
78
79 int argc = args.length;
80 int i = 0;
81 while (i<argc) {
82 if (args[i].startsWith("-")) {
83
84 // -removeold causes the existing index to be overwritten
85 if (args[i].equals("-removeold")) {
86 create_new_index = true;
87 }
88
89 // -verbosity [num]
90 else if (args[i].equals("-verbosity")) {
91 i++;
92 if (i<argc) {
93 verbosity = Integer.parseInt(args[i]);
94 if (verbosity>=5) {
95 debug = true;
96 }
97 }
98 }
99 else if (args[i].equals("-debug")) {
100 debug = true;
101 }
102 else {
103 System.out.println("Unrecognised option: " + args[i]);
104 }
105 }
106 else {
107 filtered_args.add((Object)args[i]);
108 }
109 i++;
110 }
111
112 if (filtered_args.size() != 3) {
113 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
114 return;
115 }
116
117 String doc_tag_level = (String)filtered_args.get(0);
118 String building_dirname = (String)filtered_args.get(1);
119 String index_dirname = (String)filtered_args.get(2);
120
121 String import_dirname = building_dirname + File.separator + "text";
122
123 File import_dir = new File(import_dirname);
124 File building_dir = new File(building_dirname);
125
126 if (!import_dir.exists()) {
127 System.out.println("Couldn't find import directory: "+import_dirname);
128 return;
129 }
130
131 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132 idx_dir.mkdir();
133
134 // Set up indexer
135 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137 // Read from stdin the files to process
138 try {
139 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140 BufferedReader brin = new BufferedReader(isr);
141
142 StringBuffer xml_text = new StringBuffer(1024);
143 String line = null;
144 while ((line = brin.readLine()) != null) {
145 xml_text.append(line);
146 xml_text.append(" ");
147
148 debug("Got line " + line);
149
150 if (line.endsWith("</Delete>")) {
151
152 indexer.delete(xml_text.toString());
153 xml_text = new StringBuffer(1024);
154 }
155 else if (line.startsWith("</Doc>")) {
156 indexer.index(xml_text.toString());
157 xml_text = new StringBuffer(1024);
158 }
159 }
160
161 brin.close();
162 isr.close();
163
164 } catch (IOException e) {
165 System.err.println("Error: unable to read from stdin");
166 e.printStackTrace();
167 }
168
169 indexer.finish();
170 }
171
172
173 static public class Indexer extends DefaultHandler
174 {
175 IndexWriter writer_ = null;
176 Analyzer analyzer_ = null;
177 SAXParser sax_parser_ = null;
178 String doc_tag_level_ = null;
179
180 Stack stack_ = null;
181 String path_ = "";
182
183 Document current_doc_ = null;
184 String current_node_ = "";
185 String current_doc_oid_ = "";
186 String indexable_current_node_ = "";
187 String current_contents_ = "";
188
189 String mode_ = "";
190 protected String file_id_ = null;
191
192 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
193
194
195 /** pass in true if want to create a new index, false if want to use the existing one */
196 public Indexer (String doc_tag_level, File index_dir, boolean create)
197 {
198 doc_tag_level_ = doc_tag_level;
199
200 try {
201 stack_ = new Stack();
202 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203 sax_parser_ = sax_factory.newSAXParser();
204
205 XMLReader reader = sax_parser_.getXMLReader();
206 reader.setFeature("http://xml.org/sax/features/validation", false);
207
208 analyzer_ = new GS2Analyzer(stop_words);
209
210 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
211 // by default, will only index 10,000 words per document
212 // Can throw out_of_memory errors
213 writer_.setMaxFieldLength(Integer.MAX_VALUE);
214 if (create) {
215 writer_.optimize();
216 }
217 }
218 catch (Exception e) {
219 // We need to know if creating/opening the index fails
220 e.printStackTrace();
221 }
222 }
223
224 /** index one document */
225 public void index (String file_id, File file)
226 {
227 mode_ = "add";
228 file_id_ = file_id;
229 path_ = "";
230 String base_path = file.getPath();
231 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
232
233 try {
234 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
235 }
236 catch (Exception e) {
237 println("parse error:");
238 e.printStackTrace();
239 }
240 }
241
242 /** index one document stored as string*/
243 public void index (String xml_text)
244 {
245 mode_ = "add";
246 file_id_ = "<xml doc on stdin>";
247 path_ = "";
248
249 try {
250 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
251 }
252 catch (Exception e) {
253 println("parse error:");
254 e.printStackTrace();
255 }
256 }
257
258 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
259 public void delete(String xml_text)
260 {
261 mode_ = "delete";
262 file_id_ = "<delete doc>";
263 path_ = "";
264
265 try {
266 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
267 }
268 catch (Exception e) {
269 println("parse error:");
270 e.printStackTrace();
271 }
272 }
273
274 public void finish()
275 {
276 /** optimise the index */
277 try {
278 writer_.optimize();
279 writer_.close();
280 }
281 catch (Exception e) {
282 }
283 }
284
285 protected void print(String s)
286 {
287 System.out.print(s);
288 }
289
290 protected void println(String s)
291 {
292 System.out.println(s);
293 }
294
295 public void startDocument() throws SAXException
296 {
297 println("Starting to process " + file_id_);
298 print("[");
299 }
300
301 public void endDocument() throws SAXException
302 {
303 println("]");
304 println("... processing finished.");
305 }
306
307 public void startElement(String uri, String localName, String qName, Attributes atts)
308 throws SAXException
309 {
310 path_ = appendPathLink(path_, qName, atts);
311
312 if (qName.equals(doc_tag_level_)) {
313 mode_ = atts.getValue("gs2:mode");
314
315 pushOnStack(); // start new doc
316 current_node_ = qName;
317
318 String node_id = atts.getValue("gs2:id");
319 print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
320 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
321
322 current_doc_oid_ = atts.getValue("gs2:docOID");
323 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
324 }
325
326 if (isIndexable(atts)) {
327 indexable_current_node_ = qName;
328 }
329 else {
330 indexable_current_node_ = "";
331 }
332 }
333
334 public static boolean isIndexable(Attributes atts)
335 {
336 boolean is_indexable = false;
337
338 String index = atts.getValue("index");
339 if (index!=null) {
340 if (index.equals("1")) {
341 is_indexable = true;
342 }
343 }
344 return is_indexable;
345 }
346
347 public void endElement(String uri, String localName, String qName) throws SAXException
348 {
349 if (mode_.equals("delete")) {
350 try {
351 deleteDocument(current_doc_oid_);
352 }
353 catch (java.io.IOException e) {
354 e.printStackTrace();
355 }
356 }
357 else if (mode_.equals("add") || mode_.equals("update")) {
358 if (qName.equals(indexable_current_node_))
359 {
360 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
361 // The byXX fields are used for sorting search results
362 // We don't want to do that for Text or AllFields fields
363 // They need to be untokenised for sorting
364 if (!qName.equals("TX") && !qName.equals("ZZ"))
365 {
366 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
367 }
368
369 current_contents_ = "";
370 }
371
372 if (qName.equals(doc_tag_level_)) {
373 try {
374 // perhaps this is more efficient if addDocument()
375 // used for "add" and updateDocument() for "update"
376 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
377 }
378 catch (java.io.IOException e) {
379 e.printStackTrace();
380 }
381 popOffStack(); // end document
382 }
383
384 path_ = removePathLink(path_);
385 }
386 }
387
388 public void characters(char ch[], int start, int length) throws SAXException
389 {
390 String data = new String(ch, start, length).trim();
391 if (data.length() > 0 ) {
392 current_contents_ += data;
393 }
394 }
395
396 protected String appendPathLink(String path, String qName, Attributes atts)
397 {
398
399 path = path + "/"+qName;
400 if (atts.getLength()>0) {
401 String id = atts.getValue("gs2:id");
402 if (id != null) {
403 path += "[@gs2:id='"+id+"']";
404 }
405 else {
406 id = atts.getValue("gs3:id");
407 if (id != null) {
408 path += "[@gs3:id='"+id+"']";
409 }
410 }
411 }
412 return path;
413 }
414
415 protected String removePathLink(String path)
416 {
417
418 int i=path.lastIndexOf('/');
419 if (i==-1) {
420 path="";
421 } else {
422 path = path.substring(0, i);
423 }
424 return path;
425 }
426
427
428 /** these are what we save on the stack */
429 private class MyDocument
430 {
431 public Document doc = null;
432 public String contents = null;
433 public String tagname = "";
434
435 }
436
437
438 protected void pushOnStack()
439 {
440 if (current_doc_ != null) {
441 MyDocument save = new MyDocument();
442 save.doc = current_doc_;
443 save.contents = current_contents_;
444 save.tagname = current_node_;
445 stack_.push(save);
446 }
447 current_doc_ = new Document();
448 current_contents_ = "";
449 current_node_ = "";
450 }
451
452 protected void popOffStack()
453 {
454 if (!stack_.empty()) {
455 MyDocument saved = (MyDocument)stack_.pop();
456 current_doc_ = saved.doc;
457 current_contents_ = saved.contents;
458 current_node_ = saved.tagname;
459 } else {
460 current_doc_ = new Document();
461 current_contents_ = "";
462 current_node_ = "";
463 }
464 }
465
466
467 protected void deleteDocument(String doc_id)
468 throws IOException
469 {
470 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
471 debug("- Initial number of documents in index: " + writer_.docCount());
472 writer_.deleteDocuments(new Term("docOID", doc_id));
473 debug("- Final number of documents in index: " + writer_.docCount());
474 }
475
476
477 }
478}
Note: See TracBrowser for help on using the repository browser.