Context Navigation

source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 24731

Last change on this file since 24731 was 24731, checked in by sjm84, 13 years ago
Lucene 3.x version of code accidentally committed; rolling back to the 2.x-compatible version
Property svn:keywords set to `Author Date Id Revision`
File size: 11.9 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	protected static boolean debug = false;
62
63	protected static void debug(String message)
64	{
65	if (debug) {
66	System.err.println(message);
67	}
68	}
69
70
71	public static void main (String args[]) throws Exception
72	{
73	int verbosity = 1;
74	// Default is to edit the existing index
75	boolean create_new_index = false;
76
77	Vector filtered_args = new Vector();
78
79	int argc = args.length;
80	int i = 0;
81	while (i<argc) {
82	if (args[i].startsWith("-")) {
83
84	// -removeold causes the existing index to be overwritten
85	if (args[i].equals("-removeold")) {
86	create_new_index = true;
87	}
88
89	// -verbosity [num]
90	else if (args[i].equals("-verbosity")) {
91	i++;
92	if (i<argc) {
93	verbosity = Integer.parseInt(args[i]);
94	if (verbosity>=5) {
95	debug = true;
96	}
97	}
98	}
99	else if (args[i].equals("-debug")) {
100	debug = true;
101	}
102	else {
103	System.err.println("Unrecognised option: " + args[i]);
104	}
105	}
106	else {
107	filtered_args.add((Object)args[i]);
108	}
109	i++;
110	}
111
112	if (filtered_args.size() != 3) {
113	System.err.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
114	return;
115	}
116
117	String doc_tag_level = (String)filtered_args.get(0);
118	String building_dirname = (String)filtered_args.get(1);
119	String index_dirname = (String)filtered_args.get(2);
120
121	String import_dirname = building_dirname + File.separator + "text";
122
123	File import_dir = new File(import_dirname);
124	File building_dir = new File(building_dirname);
125
126	if (!import_dir.exists()) {
127	System.err.println("Couldn't find import directory: "+import_dirname);
128	return;
129	}
130
131	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132	idx_dir.mkdir();
133
134	// Set up indexer
135	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137	// Read from stdin the files to process
138	try {
139	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140	BufferedReader brin = new BufferedReader(isr);
141
142	StringBuffer xml_text = new StringBuffer(1024);
143	String line = null;
144	while ((line = brin.readLine()) != null) {
145	xml_text.append(line);
146	xml_text.append(" ");
147
148	debug("Got line " + line);
149
150	if (line.endsWith("</Delete>")) {
151
152	indexer.delete(xml_text.toString());
153	xml_text = new StringBuffer(1024);
154	}
155	else if (line.startsWith("</Doc>")) {
156	indexer.index(xml_text.toString());
157	xml_text = new StringBuffer(1024);
158	}
159	}
160
161	brin.close();
162	isr.close();
163
164	} catch (IOException e) {
165	System.err.println("Error: unable to read from stdin");
166	e.printStackTrace();
167	}
168
169	indexer.finish();
170	}
171
172
173	static public class Indexer extends DefaultHandler
174	{
175	IndexWriter writer_ = null;
176	Analyzer analyzer_ = null;
177	SAXParser sax_parser_ = null;
178	String doc_tag_level_ = null;
179
180	Stack stack_ = null;
181	String path_ = "";
182
183	Document current_doc_ = null;
184	String current_node_ = "";
185	String current_doc_oid_ = "";
186	String indexable_current_node_ = "";
187	String current_contents_ = "";
188
189	String mode_ = "";
190	protected String file_id_ = null;
191
192	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
193
194
195	/** pass in true if want to create a new index, false if want to use the existing one */
196	public Indexer (String doc_tag_level, File index_dir, boolean create)
197	{
198	doc_tag_level_ = doc_tag_level;
199
200	try {
201	stack_ = new Stack();
202	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203	sax_parser_ = sax_factory.newSAXParser();
204
205	XMLReader reader = sax_parser_.getXMLReader();
206	reader.setFeature("http://xml.org/sax/features/validation", false);
207
208	analyzer_ = new GS2Analyzer(stop_words);
209
210	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
211	// by default, will only index 10,000 words per document
212	// Can throw out_of_memory errors
213	writer_.setMaxFieldLength(Integer.MAX_VALUE);
214	if (create) {
215	writer_.optimize();
216	}
217	}
218	catch (Exception e) {
219	// We need to know if creating/opening the index fails
220	e.printStackTrace();
221	}
222	}
223
224	/** index one document */
225	public void index (String file_id, File file)
226	{
227	mode_ = "add";
228	file_id_ = file_id;
229	path_ = "";
230	String base_path = file.getPath();
231	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
232
233	try {
234	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
235	}
236	catch (Exception e) {
237	println("parse error:");
238	e.printStackTrace();
239	}
240	}
241
242	/** index one document stored as string*/
243	public void index (String xml_text)
244	{
245	mode_ = "add";
246	file_id_ = "<xml doc on stdin>";
247	path_ = "";
248
249	try {
250	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
251	}
252	catch (Exception e) {
253	println("parse error:");
254	e.printStackTrace();
255	}
256	}
257
258	/** delete one document, based on doc_id in <Delete>doc_id</Delete> */
259	public void delete(String xml_text)
260	{
261	mode_ = "delete";
262	file_id_ = "<delete doc>";
263	path_ = "";
264
265	try {
266	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
267	}
268	catch (Exception e) {
269	println("parse error:");
270	e.printStackTrace();
271	}
272	}
273
274	public void finish()
275	{
276	/** optimise the index */
277	try {
278	writer_.optimize();
279	writer_.close();
280	}
281	catch (Exception e) {
282	}
283	}
284
285	protected void print(String s)
286	{
287	System.err.print(s);
288	}
289
290	protected void println(String s)
291	{
292	System.err.println(s);
293	}
294
295	public void startDocument() throws SAXException
296	{
297	println("Starting to process " + file_id_);
298	print("[");
299	}
300
301	public void endDocument() throws SAXException
302	{
303	println("]");
304	println("... processing finished.");
305	}
306
307	public void startElement(String uri, String localName, String qName, Attributes atts)
308	throws SAXException
309	{
310	path_ = appendPathLink(path_, qName, atts);
311
312	if (qName.equals(doc_tag_level_)) {
313	mode_ = atts.getValue("gs2:mode");
314
315	pushOnStack(); // start new doc
316	current_node_ = qName;
317
318	//String node_id = atts.getValue("gs2:id");
319	//print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
320	//current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
321
322	current_doc_oid_ = atts.getValue("gs2:docOID");
323	print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
324	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
325	}
326
327	if (isIndexable(atts)) {
328	indexable_current_node_ = qName;
329	}
330	else {
331	indexable_current_node_ = "";
332	}
333	}
334
335	public static boolean isIndexable(Attributes atts)
336	{
337	boolean is_indexable = false;
338
339	String index = atts.getValue("index");
340	if (index!=null) {
341	if (index.equals("1")) {
342	is_indexable = true;
343	}
344	}
345	return is_indexable;
346	}
347
348	public void endElement(String uri, String localName, String qName) throws SAXException
349	{
350	if (mode_.equals("delete")) {
351	try {
352	deleteDocument(current_doc_oid_);
353	}
354	catch (java.io.IOException e) {
355	e.printStackTrace();
356	}
357	}
358	else if (mode_.equals("add") \|\| mode_.equals("update")) {
359	if (qName.equals(indexable_current_node_))
360	{
361	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
362	// The byXX fields are used for sorting search results
363	// We don't want to do that for Text or AllFields fields
364	// They need to be untokenised for sorting
365	if (!qName.equals("TX") && !qName.equals("ZZ"))
366	{
367	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
368	}
369
370	current_contents_ = "";
371	}
372
373	if (qName.equals(doc_tag_level_)) {
374	try {
375	// perhaps this is more efficient if addDocument()
376	// used for "add" and updateDocument() for "update"
377	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
378	}
379	catch (java.io.IOException e) {
380	e.printStackTrace();
381	}
382	popOffStack(); // end document
383	}
384
385	path_ = removePathLink(path_);
386	}
387	}
388
389	public void characters(char ch[], int start, int length) throws SAXException
390	{
391	String data = new String(ch, start, length).trim();
392	if (data.length() > 0 ) {
393	current_contents_ += data;
394	}
395	}
396
397	protected String appendPathLink(String path, String qName, Attributes atts)
398	{
399
400	path = path + "/"+qName;
401	if (atts.getLength()>0) {
402	// was gs2:id, changed to gs2:docOID --kjdon
403	String id = atts.getValue("gs2:docOID");
404	if (id != null) {
405	path += "[@gs2:docOID='"+id+"']";
406	}
407	else {
408	// is this ever used? not in perl currently
409	id = atts.getValue("gs3:id");
410	if (id != null) {
411	path += "[@gs3:id='"+id+"']";
412	}
413	}
414	}
415	return path;
416	}
417
418	protected String removePathLink(String path)
419	{
420
421	int i=path.lastIndexOf('/');
422	if (i==-1) {
423	path="";
424	} else {
425	path = path.substring(0, i);
426	}
427	return path;
428	}
429
430
431	/** these are what we save on the stack */
432	private class MyDocument
433	{
434	public Document doc = null;
435	public String contents = null;
436	public String tagname = "";
437
438	}
439
440
441	protected void pushOnStack()
442	{
443	if (current_doc_ != null) {
444	MyDocument save = new MyDocument();
445	save.doc = current_doc_;
446	save.contents = current_contents_;
447	save.tagname = current_node_;
448	stack_.push(save);
449	}
450	current_doc_ = new Document();
451	current_contents_ = "";
452	current_node_ = "";
453	}
454
455	protected void popOffStack()
456	{
457	if (!stack_.empty()) {
458	MyDocument saved = (MyDocument)stack_.pop();
459	current_doc_ = saved.doc;
460	current_contents_ = saved.contents;
461	current_node_ = saved.tagname;
462	} else {
463	current_doc_ = new Document();
464	current_contents_ = "";
465	current_node_ = "";
466	}
467	}
468
469
470	protected void deleteDocument(String doc_id)
471	throws IOException
472	{
473	debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
474	debug("- Initial number of documents in index: " + writer_.docCount());
475	writer_.deleteDocuments(new Term("docOID", doc_id));
476	debug("- Final number of documents in index: " + writer_.docCount());
477	}
478
479
480	}
481	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: