source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 17804

Last change on this file since 17804 was 17804, checked in by davidb, 15 years ago

Introduction of GS2Analyzer, which overrides default behaviour of StandardAnalyzer to make accent folding of Latin-1 *on*

  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 public static void main (String args[]) throws Exception
62 {
63 int verbosity = 1;
64 // Default is to edit the existing index
65 boolean create_new_index = false;
66
67 Vector filtered_args = new Vector();
68
69 int argc = args.length;
70 int i = 0;
71 while (i<argc) {
72 if (args[i].startsWith("-")) {
73
74 // -removeold causes the existing index to be overwritten
75 if (args[i].equals("-removeold")) {
76 create_new_index = true;
77 }
78
79 // -verbosity [num]
80 else if (args[i].equals("-verbosity")) {
81 i++;
82 if (i<argc) {
83 verbosity = Integer.parseInt(args[i]);
84 }
85 }
86 else {
87 System.out.println("Unrecognised option: " + args[i]);
88 }
89 }
90 else {
91 filtered_args.add((Object)args[i]);
92 }
93 i++;
94 }
95
96 if (filtered_args.size() != 3) {
97 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
98 return;
99 }
100
101 String doc_tag_level = (String)filtered_args.get(0);
102 String building_dirname = (String)filtered_args.get(1);
103 String index_dirname = (String)filtered_args.get(2);
104
105 String import_dirname = building_dirname + File.separator + "text";
106
107 File import_dir = new File(import_dirname);
108 File building_dir = new File(building_dirname);
109
110 if (!import_dir.exists()) {
111 System.out.println("Couldn't find import directory: "+import_dirname);
112 return;
113 }
114
115 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116 idx_dir.mkdir();
117
118 // Set up indexer
119 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121 // Read from stdin the files to process
122 try {
123 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124 BufferedReader brin = new BufferedReader(isr);
125
126 StringBuffer xml_text = new StringBuffer(1024);
127 String line = null;
128 while ((line = brin.readLine()) != null) {
129 xml_text.append(line);
130 if (line.startsWith("</Doc>")) {
131 indexer.index(xml_text.toString());
132 xml_text = new StringBuffer(1024);
133 }
134 }
135
136 brin.close();
137 isr.close();
138
139 } catch (IOException e) {
140 System.err.println("Error: unable to read from stdin");
141 e.printStackTrace();
142 }
143
144 indexer.finish();
145 }
146
147
148 static public class Indexer extends DefaultHandler
149 {
150 IndexWriter writer_ = null;
151 Analyzer analyzer_ = null;
152 SAXParser sax_parser_ = null;
153 String doc_tag_level_ = null;
154
155 Stack stack_ = null;
156 String path_ = "";
157
158 Document current_doc_ = null;
159 String current_node_ = "";
160 String current_doc_oid_ = "";
161 String indexable_current_node_ = "";
162 String current_contents_ = "";
163
164 protected String file_id_ = null;
165
166 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
167
168 /** pass in true if want to create a new index, false if want to use the existing one */
169 public Indexer (String doc_tag_level, File index_dir, boolean create)
170 {
171 doc_tag_level_ = doc_tag_level;
172
173 try {
174 stack_ = new Stack();
175 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
176 sax_parser_ = sax_factory.newSAXParser();
177
178 XMLReader reader = sax_parser_.getXMLReader();
179 reader.setFeature("http://xml.org/sax/features/validation", false);
180
181 analyzer_ = new GS2Analyzer(stop_words);
182
183 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
184 // by default, will only index 10,000 words per document
185 // Can throw out_of_memory errors
186 writer_.setMaxFieldLength(Integer.MAX_VALUE);
187 if (create) {
188 writer_.optimize();
189 }
190 }
191 catch (Exception e) {
192 // We need to know if creating/opening the index fails
193 e.printStackTrace();
194 }
195 }
196
197 /** index one document */
198 public void index (String file_id, File file)
199 {
200 file_id_ = file_id;
201 path_ = "";
202 String base_path = file.getPath();
203 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
204
205 try {
206 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
207 }
208 catch (Exception e) {
209 println("parse error:");
210 e.printStackTrace();
211 }
212 }
213
214 /** index one document stored as string*/
215 public void index (String xml_text)
216 {
217 file_id_ = "<xml doc on stdin>";
218 path_ = "";
219
220 try {
221 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
222 }
223 catch (Exception e) {
224 println("parse error:");
225 e.printStackTrace();
226 }
227 }
228
229 public void finish()
230 {
231 /** optimise the index */
232 try {
233 writer_.optimize();
234 writer_.close();
235 }
236 catch (Exception e) {
237 }
238 }
239
240 protected void print(String s)
241 {
242 System.out.print(s);
243 }
244
245 protected void println(String s)
246 {
247 System.out.println(s);
248 }
249
250 public void startDocument() throws SAXException
251 {
252 println("Starting to index " + file_id_);
253 print("[");
254 }
255
256 public void endDocument() throws SAXException
257 {
258 println("]");
259 println("... indexing finished.");
260 }
261
262 public void startElement(String uri, String localName, String qName, Attributes atts)
263 throws SAXException
264 {
265 path_ = appendPathLink(path_, qName, atts);
266
267 if (qName.equals(doc_tag_level_)) {
268 pushOnStack(); // start new doc
269 current_node_ = qName;
270
271 String node_id = atts.getValue("gs2:id");
272 print(" " + qName + ": " + node_id );
273 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.TOKENIZED));
274
275 current_doc_oid_ = atts.getValue("gs2:docOID");
276 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.TOKENIZED));
277 }
278
279 if (isIndexable(atts)) {
280 indexable_current_node_ = qName;
281 }
282 else {
283 indexable_current_node_ = "";
284 }
285 }
286
287 public static boolean isIndexable(Attributes atts)
288 {
289 boolean is_indexable = false;
290
291 String index = atts.getValue("index");
292 if (index!=null) {
293 if (index.equals("1")) {
294 is_indexable = true;
295 }
296 }
297 return is_indexable;
298 }
299
300 public void endElement(String uri, String localName, String qName) throws SAXException
301 {
302 if (qName.equals(indexable_current_node_))
303 {
304 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
305 // We only need the term vector for the TX field
306 if (!qName.equals("TX"))
307 {
308 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.NO));
309 }
310
311 current_contents_ = "";
312 }
313
314 if (qName.equals(doc_tag_level_)) {
315 try {
316 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
317 }
318 catch (java.io.IOException e) {
319 e.printStackTrace();
320 }
321 popOffStack(); // end document
322 }
323
324 path_ = removePathLink(path_);
325 }
326
327 public void characters(char ch[], int start, int length) throws SAXException
328 {
329 String data = new String(ch, start, length).trim();
330 if (data.length() > 0 ) {
331 current_contents_ += data;
332 }
333 }
334
335 protected String appendPathLink(String path, String qName, Attributes atts)
336 {
337
338 path = path + "/"+qName;
339 if (atts.getLength()>0) {
340 String id = atts.getValue("gs2:id");
341 if (id != null) {
342 path += "[@gs2:id='"+id+"']";
343 }
344 else {
345 id = atts.getValue("gs3:id");
346 if (id != null) {
347 path += "[@gs3:id='"+id+"']";
348 }
349 }
350 }
351 return path;
352 }
353
354 protected String removePathLink(String path)
355 {
356
357 int i=path.lastIndexOf('/');
358 if (i==-1) {
359 path="";
360 } else {
361 path = path.substring(0, i);
362 }
363 return path;
364 }
365
366
367 /** these are what we save on the stack */
368 private class MyDocument
369 {
370 public Document doc = null;
371 public String contents = null;
372 public String tagname = "";
373
374 }
375
376
377 protected void pushOnStack()
378 {
379 if (current_doc_ != null) {
380 MyDocument save = new MyDocument();
381 save.doc = current_doc_;
382 save.contents = current_contents_;
383 save.tagname = current_node_;
384 stack_.push(save);
385 }
386 current_doc_ = new Document();
387 current_contents_ = "";
388 current_node_ = "";
389 }
390
391 protected void popOffStack()
392 {
393 if (!stack_.empty()) {
394 MyDocument saved = (MyDocument)stack_.pop();
395 current_doc_ = saved.doc;
396 current_contents_ = saved.contents;
397 current_node_ = saved.tagname;
398 } else {
399 current_doc_ = new Document();
400 current_contents_ = "";
401 current_node_ = "";
402 }
403 }
404 }
405}
Note: See TracBrowser for help on using the repository browser.