Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 18444

Last change on this file since 18444 was 18444, checked in by davidb, 15 years ago
Modifications for incremental building to support files that need to be deleted
Property svn:keywords set to `Author Date Id Revision`
File size: 11.7 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	protected static boolean debug = false;
62
63	protected static void debug(String message)
64	{
65	if (debug) {
66	System.err.println(message);
67	}
68	}
69
70
71	public static void main (String args[]) throws Exception
72	{
73	int verbosity = 1;
74	// Default is to edit the existing index
75	boolean create_new_index = false;
76
77	Vector filtered_args = new Vector();
78
79	int argc = args.length;
80	int i = 0;
81	while (i<argc) {
82	if (args[i].startsWith("-")) {
83
84	// -removeold causes the existing index to be overwritten
85	if (args[i].equals("-removeold")) {
86	create_new_index = true;
87	}
88
89	// -verbosity [num]
90	else if (args[i].equals("-verbosity")) {
91	i++;
92	if (i<argc) {
93	verbosity = Integer.parseInt(args[i]);
94	if (verbosity>=5) {
95	debug = true;
96	}
97	}
98	}
99	else if (args[i].equals("-debug")) {
100	debug = true;
101	}
102	else {
103	System.out.println("Unrecognised option: " + args[i]);
104	}
105	}
106	else {
107	filtered_args.add((Object)args[i]);
108	}
109	i++;
110	}
111
112	if (filtered_args.size() != 3) {
113	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
114	return;
115	}
116
117	String doc_tag_level = (String)filtered_args.get(0);
118	String building_dirname = (String)filtered_args.get(1);
119	String index_dirname = (String)filtered_args.get(2);
120
121	String import_dirname = building_dirname + File.separator + "text";
122
123	File import_dir = new File(import_dirname);
124	File building_dir = new File(building_dirname);
125
126	if (!import_dir.exists()) {
127	System.out.println("Couldn't find import directory: "+import_dirname);
128	return;
129	}
130
131	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132	idx_dir.mkdir();
133
134	// Set up indexer
135	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137	// Read from stdin the files to process
138	try {
139	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140	BufferedReader brin = new BufferedReader(isr);
141
142	StringBuffer xml_text = new StringBuffer(1024);
143	String line = null;
144	while ((line = brin.readLine()) != null) {
145	xml_text.append(line);
146
147	debug("Got line " + line);
148
149	if (line.endsWith("</Delete>")) {
150
151	indexer.delete(xml_text.toString());
152	xml_text = new StringBuffer(1024);
153	}
154	else if (line.startsWith("</Doc>")) {
155	indexer.index(xml_text.toString());
156	xml_text = new StringBuffer(1024);
157	}
158	}
159
160	brin.close();
161	isr.close();
162
163	} catch (IOException e) {
164	System.err.println("Error: unable to read from stdin");
165	e.printStackTrace();
166	}
167
168	indexer.finish();
169	}
170
171
172	static public class Indexer extends DefaultHandler
173	{
174	IndexWriter writer_ = null;
175	Analyzer analyzer_ = null;
176	SAXParser sax_parser_ = null;
177	String doc_tag_level_ = null;
178
179	Stack stack_ = null;
180	String path_ = "";
181
182	Document current_doc_ = null;
183	String current_node_ = "";
184	String current_doc_oid_ = "";
185	String indexable_current_node_ = "";
186	String current_contents_ = "";
187
188	String mode_ = "";
189	protected String file_id_ = null;
190
191	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
192
193
194	/** pass in true if want to create a new index, false if want to use the existing one */
195	public Indexer (String doc_tag_level, File index_dir, boolean create)
196	{
197	doc_tag_level_ = doc_tag_level;
198
199	try {
200	stack_ = new Stack();
201	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
202	sax_parser_ = sax_factory.newSAXParser();
203
204	XMLReader reader = sax_parser_.getXMLReader();
205	reader.setFeature("http://xml.org/sax/features/validation", false);
206
207	analyzer_ = new GS2Analyzer(stop_words);
208
209	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
210	// by default, will only index 10,000 words per document
211	// Can throw out_of_memory errors
212	writer_.setMaxFieldLength(Integer.MAX_VALUE);
213	if (create) {
214	writer_.optimize();
215	}
216	}
217	catch (Exception e) {
218	// We need to know if creating/opening the index fails
219	e.printStackTrace();
220	}
221	}
222
223	/** index one document */
224	public void index (String file_id, File file)
225	{
226	mode_ = "index";
227	file_id_ = file_id;
228	path_ = "";
229	String base_path = file.getPath();
230	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
231
232	try {
233	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
234	}
235	catch (Exception e) {
236	println("parse error:");
237	e.printStackTrace();
238	}
239	}
240
241	/** index one document stored as string*/
242	public void index (String xml_text)
243	{
244	mode_ = "index";
245	file_id_ = "<xml doc on stdin>";
246	path_ = "";
247
248	try {
249	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
250	}
251	catch (Exception e) {
252	println("parse error:");
253	e.printStackTrace();
254	}
255	}
256
257	/** delete one document, based on doc_id in <Delete>doc_id</Delete> */
258	public void delete(String xml_text)
259	{
260	mode_ = "delete";
261	file_id_ = "<delete doc>";
262	path_ = "";
263
264	try {
265	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
266	}
267	catch (Exception e) {
268	println("parse error:");
269	e.printStackTrace();
270	}
271	}
272
273	public void finish()
274	{
275	/** optimise the index */
276	try {
277	writer_.optimize();
278	writer_.close();
279	}
280	catch (Exception e) {
281	}
282	}
283
284	protected void print(String s)
285	{
286	System.out.print(s);
287	}
288
289	protected void println(String s)
290	{
291	System.out.println(s);
292	}
293
294	public void startDocument() throws SAXException
295	{
296	if (mode_.equals("index")) {
297	println("Starting to index " + file_id_);
298	print("[");
299	}
300	}
301
302	public void endDocument() throws SAXException
303	{
304	if (mode_.equals("index")) {
305	println("]");
306	println("... indexing finished.");
307	}
308	}
309
310	public void startElement(String uri, String localName, String qName, Attributes atts)
311	throws SAXException
312	{
313	if (mode_.equals("index")) {
314	path_ = appendPathLink(path_, qName, atts);
315
316	if (qName.equals(doc_tag_level_)) {
317	pushOnStack(); // start new doc
318	current_node_ = qName;
319
320	String node_id = atts.getValue("gs2:id");
321	print(" " + qName + ": " + node_id );
322	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
323
324	current_doc_oid_ = atts.getValue("gs2:docOID");
325	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
326	}
327
328	if (isIndexable(atts)) {
329	indexable_current_node_ = qName;
330	}
331	else {
332	indexable_current_node_ = "";
333	}
334	}
335	}
336
337	public static boolean isIndexable(Attributes atts)
338	{
339	boolean is_indexable = false;
340
341	String index = atts.getValue("index");
342	if (index!=null) {
343	if (index.equals("1")) {
344	is_indexable = true;
345	}
346	}
347	return is_indexable;
348	}
349
350	public void endElement(String uri, String localName, String qName) throws SAXException
351	{
352	if (mode_.equals("index")) {
353	if (qName.equals(indexable_current_node_))
354	{
355	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
356	// The byXX fields are used for sorting search results
357	// We don't want to do that for Text or AllFields fields
358	// They need to be untokenised for sorting
359	if (!qName.equals("TX") && !qName.equals("ZZ"))
360	{
361	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
362	}
363
364	current_contents_ = "";
365	}
366
367	if (qName.equals(doc_tag_level_)) {
368	try {
369	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
370	}
371	catch (java.io.IOException e) {
372	e.printStackTrace();
373	}
374	popOffStack(); // end document
375	}
376
377	path_ = removePathLink(path_);
378	}
379	else if (mode_.equals("delete")) {
380	if (qName.equals("Delete")) {
381	try {
382	deleteDocument(current_contents_);
383	current_contents_ = "";
384	}
385	catch (java.io.IOException e) {
386	e.printStackTrace();
387	}
388	}
389	}
390	}
391
392	public void characters(char ch[], int start, int length) throws SAXException
393	{
394	String data = new String(ch, start, length).trim();
395	if (data.length() > 0 ) {
396	current_contents_ += data;
397	}
398	}
399
400	protected String appendPathLink(String path, String qName, Attributes atts)
401	{
402
403	path = path + "/"+qName;
404	if (atts.getLength()>0) {
405	String id = atts.getValue("gs2:id");
406	if (id != null) {
407	path += "[@gs2:id='"+id+"']";
408	}
409	else {
410	id = atts.getValue("gs3:id");
411	if (id != null) {
412	path += "[@gs3:id='"+id+"']";
413	}
414	}
415	}
416	return path;
417	}
418
419	protected String removePathLink(String path)
420	{
421
422	int i=path.lastIndexOf('/');
423	if (i==-1) {
424	path="";
425	} else {
426	path = path.substring(0, i);
427	}
428	return path;
429	}
430
431
432	/** these are what we save on the stack */
433	private class MyDocument
434	{
435	public Document doc = null;
436	public String contents = null;
437	public String tagname = "";
438
439	}
440
441
442	protected void pushOnStack()
443	{
444	if (current_doc_ != null) {
445	MyDocument save = new MyDocument();
446	save.doc = current_doc_;
447	save.contents = current_contents_;
448	save.tagname = current_node_;
449	stack_.push(save);
450	}
451	current_doc_ = new Document();
452	current_contents_ = "";
453	current_node_ = "";
454	}
455
456	protected void popOffStack()
457	{
458	if (!stack_.empty()) {
459	MyDocument saved = (MyDocument)stack_.pop();
460	current_doc_ = saved.doc;
461	current_contents_ = saved.contents;
462	current_node_ = saved.tagname;
463	} else {
464	current_doc_ = new Document();
465	current_contents_ = "";
466	current_node_ = "";
467	}
468	}
469
470
471	protected void deleteDocument(String doc_id)
472	throws IOException
473	{
474	debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
475	debug("- Initial number of documents in index: " + writer_.docCount());
476	writer_.deleteDocuments(new Term("docOID", doc_id));
477	debug("- Final number of documents in index: " + writer_.docCount());
478	}
479
480
481	}
482	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: