source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 16437

Last change on this file since 16437 was 16437, checked in by mdewsnip, 16 years ago

Now stores the new "docOID" value in the index, to help support incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.analysis.standard.StandardAnalyzer;
46
47import java.util.Stack;
48import java.io.FileInputStream;
49import java.io.File;
50import java.io.StringReader;
51import java.net.URL;
52
53
54/**
55 * class for indexing XML generated by lucenebuildproc.pm
56 */
57
58public class GS2LuceneIndexer {
59
60 public static void main (String args[]) throws Exception
61 {
62
63 int verbosity = 1;
64 // Default is to edit the existing index
65 boolean create_new_index = false;
66
67 Vector filtered_args = new Vector();
68
69 int argc = args.length;
70 int i = 0;
71 while (i<argc) {
72 if (args[i].startsWith("-")) {
73
74 // -removeold causes the existing index to be overwritten
75 if (args[i].equals("-removeold")) {
76 create_new_index = true;
77 }
78
79 // -verbosity [num]
80 else if (args[i].equals("-verbosity")) {
81 i++;
82 if (i<argc) {
83 verbosity = Integer.parseInt(args[i]);
84 }
85 }
86 else {
87 System.out.println("Unrecognised option: " + args[i]);
88 }
89 }
90 else {
91 filtered_args.add((Object)args[i]);
92 }
93 i++;
94 }
95
96 if (filtered_args.size() != 3) {
97 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
98 return;
99 }
100
101 String doc_tag_level = (String)filtered_args.get(0);
102 String building_dirname = (String)filtered_args.get(1);
103 String index_dirname = (String)filtered_args.get(2);
104
105 String import_dirname = building_dirname + File.separator + "text";
106
107 File import_dir = new File(import_dirname);
108 File building_dir = new File(building_dirname);
109
110 if (!import_dir.exists()) {
111 System.out.println("Couldn't find import directory: "+import_dirname);
112 return;
113 }
114
115 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116 idx_dir.mkdir();
117
118 // Set up indexer
119 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121 // Read from stdin the files to process
122 try {
123 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124 BufferedReader brin = new BufferedReader(isr);
125
126 StringBuffer xml_text = new StringBuffer(1024);
127 String line = null;
128 while ((line = brin.readLine()) != null) {
129 xml_text.append(line);
130 if (line.startsWith("</Doc>")) {
131 indexer.index(xml_text.toString());
132 xml_text = new StringBuffer(1024);
133 }
134 }
135
136 brin.close();
137 isr.close();
138
139 } catch (IOException e) {
140 System.err.println("Error: unable to read from stdin");
141 e.printStackTrace();
142 }
143
144 indexer.finish();
145 }
146
147
148 static public class Indexer extends DefaultHandler
149 {
150 IndexWriter writer_ = null;
151 SAXParser sax_parser_ = null;
152 String doc_tag_level_ = null;
153
154 Stack stack_ = null;
155 String path_ = "";
156
157 Document current_doc_ = null;
158 String current_node_ = "";
159 String indexable_current_node_ = "";
160 String current_contents_ = "";
161
162 protected String file_id_ = null;
163
164 /** pass in true if want to create a new index, false if want to use the existing one */
165 public Indexer (String doc_tag_level, File index_dir, boolean create)
166 {
167 doc_tag_level_ = doc_tag_level;
168
169 try {
170 stack_ = new Stack();
171 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
172 sax_parser_ = sax_factory.newSAXParser();
173
174 XMLReader reader = sax_parser_.getXMLReader();
175 reader.setFeature("http://xml.org/sax/features/validation", false);
176
177 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
178 // by default, will only index 10,000 words per document
179 // Can throw out_of_memory errors
180 writer_.setMaxFieldLength(Integer.MAX_VALUE);
181 if (create) {
182 writer_.optimize();
183 }
184
185 } catch (Exception e) {
186 // do nothing!
187 }
188 }
189
190 /** index one document */
191 public void index (String file_id, File file)
192 {
193 file_id_ = file_id;
194 path_ = "";
195 String base_path = file.getPath();
196 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
197
198 try {
199 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
200 }
201 catch (Exception e) {
202 println("parse error:");
203 e.printStackTrace();
204 }
205 }
206
207 /** index one document stored as string*/
208 public void index (String xml_text)
209 {
210 file_id_ = "<xml doc on stdin>";
211 path_ = "";
212
213 try {
214 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
215 }
216 catch (Exception e) {
217 println("parse error:");
218 e.printStackTrace();
219 }
220 }
221
222 public void finish()
223 {
224 /** optimise the index */
225 try {
226 writer_.optimize();
227 writer_.close();
228 }
229 catch (Exception e) {
230 }
231 }
232
233 protected void print(String s)
234 {
235 System.out.print(s);
236 }
237
238 protected void println(String s)
239 {
240 System.out.println(s);
241 }
242
243 public void startDocument() throws SAXException
244 {
245 println("Starting to index " + file_id_);
246 print("[");
247 }
248
249 public void endDocument() throws SAXException
250 {
251 println("]");
252 println("... indexing finished.");
253 }
254
255 public void startElement(String uri, String localName, String qName, Attributes atts)
256 throws SAXException
257 {
258 path_ = appendPathLink(path_, qName, atts);
259
260 if (qName.equals(doc_tag_level_)) {
261 pushOnStack(); // start new doc
262 current_node_ = qName;
263
264 String node_id = atts.getValue("gs2:id");
265 print(" " + qName + ": " + node_id );
266 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
267
268 String current_doc_oid_ = atts.getValue("gs2:docOID");
269 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
270 }
271
272 if (XMLTagInfo.isIndexable(atts)) {
273 indexable_current_node_ = qName;
274 }
275 else {
276 indexable_current_node_ = "";
277 }
278
279 }
280
281 public void endElement(String uri, String localName, String qName) throws SAXException
282 {
283 if (qName.equals(indexable_current_node_))
284 {
285 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
286 // We only need the term vector for the TX field
287 if (!qName.equals("TX"))
288 {
289 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
290 }
291
292 current_contents_ = "";
293 }
294
295 if (qName.equals(doc_tag_level_)) {
296 try {
297 writer_.addDocument(current_doc_);
298 }
299 catch (java.io.IOException e) {
300 e.printStackTrace();
301 }
302 popOffStack(); // end document
303 }
304
305 path_ = removePathLink(path_);
306 }
307
308 public void characters(char ch[], int start, int length) throws SAXException
309 {
310 String data = new String(ch, start, length).trim();
311 if (data.length() > 0 ) {
312 current_contents_ += data;
313 }
314 }
315
316 protected String appendPathLink(String path, String qName, Attributes atts)
317 {
318
319 path = path + "/"+qName;
320 if (atts.getLength()>0) {
321 String id = atts.getValue("gs2:id");
322 if (id != null) {
323 path += "[@gs2:id='"+id+"']";
324 }
325 else {
326 id = atts.getValue("gs3:id");
327 if (id != null) {
328 path += "[@gs3:id='"+id+"']";
329 }
330 }
331 }
332 return path;
333 }
334
335 protected String removePathLink(String path)
336 {
337
338 int i=path.lastIndexOf('/');
339 if (i==-1) {
340 path="";
341 } else {
342 path = path.substring(0, i);
343 }
344 return path;
345 }
346
347
348 /** these are what we save on the stack */
349 private class MyDocument
350 {
351 public Document doc = null;
352 public String contents = null;
353 public String tagname = "";
354
355 }
356
357
358 protected void pushOnStack()
359 {
360 if (current_doc_ != null) {
361 MyDocument save = new MyDocument();
362 save.doc = current_doc_;
363 save.contents = current_contents_;
364 save.tagname = current_node_;
365 stack_.push(save);
366 }
367 current_doc_ = new Document();
368 current_contents_ = "";
369 current_node_ = "";
370 }
371
372 protected void popOffStack()
373 {
374 if (!stack_.empty()) {
375 MyDocument saved = (MyDocument)stack_.pop();
376 current_doc_ = saved.doc;
377 current_contents_ = saved.contents;
378 current_node_ = saved.tagname;
379 } else {
380 current_doc_ = new Document();
381 current_contents_ = "";
382 current_node_ = "";
383 }
384 }
385 }
386}
Note: See TracBrowser for help on using the repository browser.