source: gsdl/trunk/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
/**********************************************************************
 *
 * GS2LuceneIndexer.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.standard.StandardAnalyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 public static void main (String args[]) throws Exception
62 {
63
64 int verbosity = 1;
65 // Default is to edit the existing index
66 boolean create_new_index = false;
67
68 Vector filtered_args = new Vector();
69
70 int argc = args.length;
71 int i = 0;
72 while (i<argc) {
73 if (args[i].startsWith("-")) {
74
75 // -removeold causes the existing index to be overwritten
76 if (args[i].equals("-removeold")) {
77 create_new_index = true;
78 }
79
80 // -verbosity [num]
81 else if (args[i].equals("-verbosity")) {
82 i++;
83 if (i<argc) {
84 verbosity = Integer.parseInt(args[i]);
85 }
86 }
87 else {
88 System.out.println("Unrecognised option: " + args[i]);
89 }
90 }
91 else {
92 filtered_args.add((Object)args[i]);
93 }
94 i++;
95 }
96
97 if (filtered_args.size() != 3) {
98 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
99 return;
100 }
101
102 String doc_tag_level = (String)filtered_args.get(0);
103 String building_dirname = (String)filtered_args.get(1);
104 String index_dirname = (String)filtered_args.get(2);
105
106 String import_dirname = building_dirname + File.separator + "text";
107
108 File import_dir = new File(import_dirname);
109 File building_dir = new File(building_dirname);
110
111 if (!import_dir.exists()) {
112 System.out.println("Couldn't find import directory: "+import_dirname);
113 return;
114 }
115
116 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
117 idx_dir.mkdir();
118
119 // Set up indexer
120 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
121
122 // Read from stdin the files to process
123 try {
124 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
125 BufferedReader brin = new BufferedReader(isr);
126
127 StringBuffer xml_text = new StringBuffer(1024);
128 String line = null;
129 while ((line = brin.readLine()) != null) {
130 xml_text.append(line);
131 if (line.startsWith("</Doc>")) {
132 indexer.index(xml_text.toString());
133 xml_text = new StringBuffer(1024);
134 }
135 }
136
137 brin.close();
138 isr.close();
139
140 } catch (IOException e) {
141 System.err.println("Error: unable to read from stdin");
142 e.printStackTrace();
143 }
144
145 indexer.finish();
146 }
147
148
149 static public class Indexer extends DefaultHandler
150 {
151 IndexWriter writer_ = null;
152 SAXParser sax_parser_ = null;
153 String doc_tag_level_ = null;
154
155 Stack stack_ = null;
156 String path_ = "";
157
158 Document current_doc_ = null;
159 String current_node_ = "";
160 String current_doc_oid_ = "";
161 String indexable_current_node_ = "";
162 String current_contents_ = "";
163
164 protected String file_id_ = null;
165
166 /** pass in true if want to create a new index, false if want to use the existing one */
167 public Indexer (String doc_tag_level, File index_dir, boolean create)
168 {
169 doc_tag_level_ = doc_tag_level;
170
171 try {
172 stack_ = new Stack();
173 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
174 sax_parser_ = sax_factory.newSAXParser();
175
176 XMLReader reader = sax_parser_.getXMLReader();
177 reader.setFeature("http://xml.org/sax/features/validation", false);
178
179 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
180 // by default, will only index 10,000 words per document
181 // Can throw out_of_memory errors
182 writer_.setMaxFieldLength(Integer.MAX_VALUE);
183 if (create) {
184 writer_.optimize();
185 }
186 }
187 catch (Exception e) {
188 // We need to know if creating/opening the index fails
189 e.printStackTrace();
190 }
191 }
192
193 /** index one document */
194 public void index (String file_id, File file)
195 {
196 file_id_ = file_id;
197 path_ = "";
198 String base_path = file.getPath();
199 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
200
201 try {
202 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
203 }
204 catch (Exception e) {
205 println("parse error:");
206 e.printStackTrace();
207 }
208 }
209
210 /** index one document stored as string*/
211 public void index (String xml_text)
212 {
213 file_id_ = "<xml doc on stdin>";
214 path_ = "";
215
216 try {
217 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
218 }
219 catch (Exception e) {
220 println("parse error:");
221 e.printStackTrace();
222 }
223 }
224
225 public void finish()
226 {
227 /** optimise the index */
228 try {
229 writer_.optimize();
230 writer_.close();
231 }
232 catch (Exception e) {
233 }
234 }
235
236 protected void print(String s)
237 {
238 System.out.print(s);
239 }
240
241 protected void println(String s)
242 {
243 System.out.println(s);
244 }
245
246 public void startDocument() throws SAXException
247 {
248 println("Starting to index " + file_id_);
249 print("[");
250 }
251
252 public void endDocument() throws SAXException
253 {
254 println("]");
255 println("... indexing finished.");
256 }
257
258 public void startElement(String uri, String localName, String qName, Attributes atts)
259 throws SAXException
260 {
261 path_ = appendPathLink(path_, qName, atts);
262
263 if (qName.equals(doc_tag_level_)) {
264 pushOnStack(); // start new doc
265 current_node_ = qName;
266
267 String node_id = atts.getValue("gs2:id");
268 print(" " + qName + ": " + node_id );
269 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
270
271 current_doc_oid_ = atts.getValue("gs2:docOID");
272 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
273 }
274
275 if (isIndexable(atts)) {
276 indexable_current_node_ = qName;
277 }
278 else {
279 indexable_current_node_ = "";
280 }
281 }
282
283 public static boolean isIndexable(Attributes atts)
284 {
285 boolean is_indexable = false;
286
287 String index = atts.getValue("index");
288 if (index!=null) {
289 if (index.equals("1")) {
290 is_indexable = true;
291 }
292 }
293 return is_indexable;
294 }
295
296 public void endElement(String uri, String localName, String qName) throws SAXException
297 {
298 if (qName.equals(indexable_current_node_))
299 {
300 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
301 // We only need the term vector for the TX field
302 if (!qName.equals("TX"))
303 {
304 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
305 }
306
307 current_contents_ = "";
308 }
309
310 if (qName.equals(doc_tag_level_)) {
311 try {
312 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_);
313 }
314 catch (java.io.IOException e) {
315 e.printStackTrace();
316 }
317 popOffStack(); // end document
318 }
319
320 path_ = removePathLink(path_);
321 }
322
323 public void characters(char ch[], int start, int length) throws SAXException
324 {
325 String data = new String(ch, start, length).trim();
326 if (data.length() > 0 ) {
327 current_contents_ += data;
328 }
329 }
330
331 protected String appendPathLink(String path, String qName, Attributes atts)
332 {
333
334 path = path + "/"+qName;
335 if (atts.getLength()>0) {
336 String id = atts.getValue("gs2:id");
337 if (id != null) {
338 path += "[@gs2:id='"+id+"']";
339 }
340 else {
341 id = atts.getValue("gs3:id");
342 if (id != null) {
343 path += "[@gs3:id='"+id+"']";
344 }
345 }
346 }
347 return path;
348 }
349
350 protected String removePathLink(String path)
351 {
352
353 int i=path.lastIndexOf('/');
354 if (i==-1) {
355 path="";
356 } else {
357 path = path.substring(0, i);
358 }
359 return path;
360 }
361
362
363 /** these are what we save on the stack */
364 private class MyDocument
365 {
366 public Document doc = null;
367 public String contents = null;
368 public String tagname = "";
369
370 }
371
372
373 protected void pushOnStack()
374 {
375 if (current_doc_ != null) {
376 MyDocument save = new MyDocument();
377 save.doc = current_doc_;
378 save.contents = current_contents_;
379 save.tagname = current_node_;
380 stack_.push(save);
381 }
382 current_doc_ = new Document();
383 current_contents_ = "";
384 current_node_ = "";
385 }
386
387 protected void popOffStack()
388 {
389 if (!stack_.empty()) {
390 MyDocument saved = (MyDocument)stack_.pop();
391 current_doc_ = saved.doc;
392 current_contents_ = saved.contents;
393 current_node_ = saved.tagname;
394 } else {
395 current_doc_ = new Document();
396 current_contents_ = "";
397 current_node_ = "";
398 }
399 }
400 }
401}
Note: See TracBrowser for help on using the repository browser.