source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 18132

Last change on this file since 18132 was 18132, checked in by kjdon, 15 years ago

we don't want to store the ZZ field again for sorting

  • Property svn:keywords set to Author Date Id Revision
File size: 10.1 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import java.util.Stack;
49import java.io.FileInputStream;
50import java.io.File;
51import java.io.StringReader;
52import java.net.URL;
53
54
55/**
56 * class for indexing XML generated by lucenebuildproc.pm
57 */
58
59public class GS2LuceneIndexer {
60
61 public static void main (String args[]) throws Exception
62 {
63 int verbosity = 1;
64 // Default is to edit the existing index
65 boolean create_new_index = false;
66
67 Vector filtered_args = new Vector();
68
69 int argc = args.length;
70 int i = 0;
71 while (i<argc) {
72 if (args[i].startsWith("-")) {
73
74 // -removeold causes the existing index to be overwritten
75 if (args[i].equals("-removeold")) {
76 create_new_index = true;
77 }
78
79 // -verbosity [num]
80 else if (args[i].equals("-verbosity")) {
81 i++;
82 if (i<argc) {
83 verbosity = Integer.parseInt(args[i]);
84 }
85 }
86 else {
87 System.out.println("Unrecognised option: " + args[i]);
88 }
89 }
90 else {
91 filtered_args.add((Object)args[i]);
92 }
93 i++;
94 }
95
96 if (filtered_args.size() != 3) {
97 System.out.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
98 return;
99 }
100
101 String doc_tag_level = (String)filtered_args.get(0);
102 String building_dirname = (String)filtered_args.get(1);
103 String index_dirname = (String)filtered_args.get(2);
104
105 String import_dirname = building_dirname + File.separator + "text";
106
107 File import_dir = new File(import_dirname);
108 File building_dir = new File(building_dirname);
109
110 if (!import_dir.exists()) {
111 System.out.println("Couldn't find import directory: "+import_dirname);
112 return;
113 }
114
115 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116 idx_dir.mkdir();
117
118 // Set up indexer
119 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121 // Read from stdin the files to process
122 try {
123 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124 BufferedReader brin = new BufferedReader(isr);
125
126 StringBuffer xml_text = new StringBuffer(1024);
127 String line = null;
128 while ((line = brin.readLine()) != null) {
129 xml_text.append(line);
130 if (line.startsWith("</Doc>")) {
131 indexer.index(xml_text.toString());
132 xml_text = new StringBuffer(1024);
133 }
134 }
135
136 brin.close();
137 isr.close();
138
139 } catch (IOException e) {
140 System.err.println("Error: unable to read from stdin");
141 e.printStackTrace();
142 }
143
144 indexer.finish();
145 }
146
147
148 static public class Indexer extends DefaultHandler
149 {
150 IndexWriter writer_ = null;
151 Analyzer analyzer_ = null;
152 SAXParser sax_parser_ = null;
153 String doc_tag_level_ = null;
154
155 Stack stack_ = null;
156 String path_ = "";
157
158 Document current_doc_ = null;
159 String current_node_ = "";
160 String current_doc_oid_ = "";
161 String indexable_current_node_ = "";
162 String current_contents_ = "";
163
164 protected String file_id_ = null;
165
166 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
167
168 /** pass in true if want to create a new index, false if want to use the existing one */
169 public Indexer (String doc_tag_level, File index_dir, boolean create)
170 {
171 doc_tag_level_ = doc_tag_level;
172
173 try {
174 stack_ = new Stack();
175 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
176 sax_parser_ = sax_factory.newSAXParser();
177
178 XMLReader reader = sax_parser_.getXMLReader();
179 reader.setFeature("http://xml.org/sax/features/validation", false);
180
181 analyzer_ = new GS2Analyzer(stop_words);
182
183 writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
184 // by default, will only index 10,000 words per document
185 // Can throw out_of_memory errors
186 writer_.setMaxFieldLength(Integer.MAX_VALUE);
187 if (create) {
188 writer_.optimize();
189 }
190 }
191 catch (Exception e) {
192 // We need to know if creating/opening the index fails
193 e.printStackTrace();
194 }
195 }
196
197 /** index one document */
198 public void index (String file_id, File file)
199 {
200 file_id_ = file_id;
201 path_ = "";
202 String base_path = file.getPath();
203 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
204
205 try {
206 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
207 }
208 catch (Exception e) {
209 println("parse error:");
210 e.printStackTrace();
211 }
212 }
213
214 /** index one document stored as string*/
215 public void index (String xml_text)
216 {
217 file_id_ = "<xml doc on stdin>";
218 path_ = "";
219
220 try {
221 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
222 }
223 catch (Exception e) {
224 println("parse error:");
225 e.printStackTrace();
226 }
227 }
228
229 public void finish()
230 {
231 /** optimise the index */
232 try {
233 writer_.optimize();
234 writer_.close();
235 }
236 catch (Exception e) {
237 }
238 }
239
240 protected void print(String s)
241 {
242 System.out.print(s);
243 }
244
245 protected void println(String s)
246 {
247 System.out.println(s);
248 }
249
250 public void startDocument() throws SAXException
251 {
252 println("Starting to index " + file_id_);
253 print("[");
254 }
255
256 public void endDocument() throws SAXException
257 {
258 println("]");
259 println("... indexing finished.");
260 }
261
262 public void startElement(String uri, String localName, String qName, Attributes atts)
263 throws SAXException
264 {
265 path_ = appendPathLink(path_, qName, atts);
266
267 if (qName.equals(doc_tag_level_)) {
268 pushOnStack(); // start new doc
269 current_node_ = qName;
270
271 String node_id = atts.getValue("gs2:id");
272 print(" " + qName + ": " + node_id );
273 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
274
275 current_doc_oid_ = atts.getValue("gs2:docOID");
276 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
277 }
278
279 if (isIndexable(atts)) {
280 indexable_current_node_ = qName;
281 }
282 else {
283 indexable_current_node_ = "";
284 }
285 }
286
287 public static boolean isIndexable(Attributes atts)
288 {
289 boolean is_indexable = false;
290
291 String index = atts.getValue("index");
292 if (index!=null) {
293 if (index.equals("1")) {
294 is_indexable = true;
295 }
296 }
297 return is_indexable;
298 }
299
300 public void endElement(String uri, String localName, String qName) throws SAXException
301 {
302 if (qName.equals(indexable_current_node_))
303 {
304 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
305 // The byXX fields are used for sorting search results
306 // We don't want to do that for Text or AllFields fields
307 // They need to be untokenised for sorting
308 if (!qName.equals("TX") && !qName.equals("ZZ"))
309 {
310 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
311 }
312
313 current_contents_ = "";
314 }
315
316 if (qName.equals(doc_tag_level_)) {
317 try {
318 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
319 }
320 catch (java.io.IOException e) {
321 e.printStackTrace();
322 }
323 popOffStack(); // end document
324 }
325
326 path_ = removePathLink(path_);
327 }
328
329 public void characters(char ch[], int start, int length) throws SAXException
330 {
331 String data = new String(ch, start, length).trim();
332 if (data.length() > 0 ) {
333 current_contents_ += data;
334 }
335 }
336
337 protected String appendPathLink(String path, String qName, Attributes atts)
338 {
339
340 path = path + "/"+qName;
341 if (atts.getLength()>0) {
342 String id = atts.getValue("gs2:id");
343 if (id != null) {
344 path += "[@gs2:id='"+id+"']";
345 }
346 else {
347 id = atts.getValue("gs3:id");
348 if (id != null) {
349 path += "[@gs3:id='"+id+"']";
350 }
351 }
352 }
353 return path;
354 }
355
356 protected String removePathLink(String path)
357 {
358
359 int i=path.lastIndexOf('/');
360 if (i==-1) {
361 path="";
362 } else {
363 path = path.substring(0, i);
364 }
365 return path;
366 }
367
368
369 /** these are what we save on the stack */
370 private class MyDocument
371 {
372 public Document doc = null;
373 public String contents = null;
374 public String tagname = "";
375
376 }
377
378
379 protected void pushOnStack()
380 {
381 if (current_doc_ != null) {
382 MyDocument save = new MyDocument();
383 save.doc = current_doc_;
384 save.contents = current_contents_;
385 save.tagname = current_node_;
386 stack_.push(save);
387 }
388 current_doc_ = new Document();
389 current_contents_ = "";
390 current_node_ = "";
391 }
392
393 protected void popOffStack()
394 {
395 if (!stack_.empty()) {
396 MyDocument saved = (MyDocument)stack_.pop();
397 current_doc_ = saved.doc;
398 current_contents_ = saved.contents;
399 current_node_ = saved.tagname;
400 } else {
401 current_doc_ = new Document();
402 current_contents_ = "";
403 current_node_ = "";
404 }
405 }
406 }
407}
Note: See TracBrowser for help on using the repository browser.