Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 20731

Last change on this file since 20731 was 20731, checked in by kjdon, 15 years ago
Removed all use of gs2:id as section ids. Now always use gs2:docOID, which is the same as the Greenstone OID. Incremental section ids don't work when it comes to incremental builds and deleting documents.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.9 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	protected static boolean debug = false;
62
63	protected static void debug(String message)
64	{
65	if (debug) {
66	System.err.println(message);
67	}
68	}
69
70
71	public static void main (String args[]) throws Exception
72	{
73	int verbosity = 1;
74	// Default is to edit the existing index
75	boolean create_new_index = false;
76
77	Vector filtered_args = new Vector();
78
79	int argc = args.length;
80	int i = 0;
81	while (i<argc) {
82	if (args[i].startsWith("-")) {
83
84	// -removeold causes the existing index to be overwritten
85	if (args[i].equals("-removeold")) {
86	create_new_index = true;
87	}
88
89	// -verbosity [num]
90	else if (args[i].equals("-verbosity")) {
91	i++;
92	if (i<argc) {
93	verbosity = Integer.parseInt(args[i]);
94	if (verbosity>=5) {
95	debug = true;
96	}
97	}
98	}
99	else if (args[i].equals("-debug")) {
100	debug = true;
101	}
102	else {
103	System.out.println("Unrecognised option: " + args[i]);
104	}
105	}
106	else {
107	filtered_args.add((Object)args[i]);
108	}
109	i++;
110	}
111
112	if (filtered_args.size() != 3) {
113	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
114	return;
115	}
116
117	String doc_tag_level = (String)filtered_args.get(0);
118	String building_dirname = (String)filtered_args.get(1);
119	String index_dirname = (String)filtered_args.get(2);
120
121	String import_dirname = building_dirname + File.separator + "text";
122
123	File import_dir = new File(import_dirname);
124	File building_dir = new File(building_dirname);
125
126	if (!import_dir.exists()) {
127	System.out.println("Couldn't find import directory: "+import_dirname);
128	return;
129	}
130
131	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132	idx_dir.mkdir();
133
134	// Set up indexer
135	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137	// Read from stdin the files to process
138	try {
139	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140	BufferedReader brin = new BufferedReader(isr);
141
142	StringBuffer xml_text = new StringBuffer(1024);
143	String line = null;
144	while ((line = brin.readLine()) != null) {
145	xml_text.append(line);
146	xml_text.append(" ");
147
148	debug("Got line " + line);
149
150	if (line.endsWith("</Delete>")) {
151
152	indexer.delete(xml_text.toString());
153	xml_text = new StringBuffer(1024);
154	}
155	else if (line.startsWith("</Doc>")) {
156	indexer.index(xml_text.toString());
157	xml_text = new StringBuffer(1024);
158	}
159	}
160
161	brin.close();
162	isr.close();
163
164	} catch (IOException e) {
165	System.err.println("Error: unable to read from stdin");
166	e.printStackTrace();
167	}
168
169	indexer.finish();
170	}
171
172
173	static public class Indexer extends DefaultHandler
174	{
175	IndexWriter writer_ = null;
176	Analyzer analyzer_ = null;
177	SAXParser sax_parser_ = null;
178	String doc_tag_level_ = null;
179
180	Stack stack_ = null;
181	String path_ = "";
182
183	Document current_doc_ = null;
184	String current_node_ = "";
185	String current_doc_oid_ = "";
186	String indexable_current_node_ = "";
187	String current_contents_ = "";
188
189	String mode_ = "";
190	protected String file_id_ = null;
191
192	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
193
194
195	/** pass in true if want to create a new index, false if want to use the existing one */
196	public Indexer (String doc_tag_level, File index_dir, boolean create)
197	{
198	doc_tag_level_ = doc_tag_level;
199
200	try {
201	stack_ = new Stack();
202	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203	sax_parser_ = sax_factory.newSAXParser();
204
205	XMLReader reader = sax_parser_.getXMLReader();
206	reader.setFeature("http://xml.org/sax/features/validation", false);
207
208	analyzer_ = new GS2Analyzer(stop_words);
209
210	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
211	// by default, will only index 10,000 words per document
212	// Can throw out_of_memory errors
213	writer_.setMaxFieldLength(Integer.MAX_VALUE);
214	if (create) {
215	writer_.optimize();
216	}
217	}
218	catch (Exception e) {
219	// We need to know if creating/opening the index fails
220	e.printStackTrace();
221	}
222	}
223
224	/** index one document */
225	public void index (String file_id, File file)
226	{
227	mode_ = "add";
228	file_id_ = file_id;
229	path_ = "";
230	String base_path = file.getPath();
231	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
232
233	try {
234	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
235	}
236	catch (Exception e) {
237	println("parse error:");
238	e.printStackTrace();
239	}
240	}
241
242	/** index one document stored as string*/
243	public void index (String xml_text)
244	{
245	mode_ = "add";
246	file_id_ = "<xml doc on stdin>";
247	path_ = "";
248
249	try {
250	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
251	}
252	catch (Exception e) {
253	println("parse error:");
254	e.printStackTrace();
255	}
256	}
257
258	/** delete one document, based on doc_id in <Delete>doc_id</Delete> */
259	public void delete(String xml_text)
260	{
261	mode_ = "delete";
262	file_id_ = "<delete doc>";
263	path_ = "";
264
265	try {
266	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
267	}
268	catch (Exception e) {
269	println("parse error:");
270	e.printStackTrace();
271	}
272	}
273
274	public void finish()
275	{
276	/** optimise the index */
277	try {
278	writer_.optimize();
279	writer_.close();
280	}
281	catch (Exception e) {
282	}
283	}
284
285	protected void print(String s)
286	{
287	System.out.print(s);
288	}
289
290	protected void println(String s)
291	{
292	System.out.println(s);
293	}
294
295	public void startDocument() throws SAXException
296	{
297	println("Starting to process " + file_id_);
298	print("[");
299	}
300
301	public void endDocument() throws SAXException
302	{
303	println("]");
304	println("... processing finished.");
305	}
306
307	public void startElement(String uri, String localName, String qName, Attributes atts)
308	throws SAXException
309	{
310	path_ = appendPathLink(path_, qName, atts);
311
312	if (qName.equals(doc_tag_level_)) {
313	mode_ = atts.getValue("gs2:mode");
314
315	pushOnStack(); // start new doc
316	current_node_ = qName;
317
318	//String node_id = atts.getValue("gs2:id");
319	//print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
320	//current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
321
322	current_doc_oid_ = atts.getValue("gs2:docOID");
323	print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
324	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
325	}
326
327	if (isIndexable(atts)) {
328	indexable_current_node_ = qName;
329	}
330	else {
331	indexable_current_node_ = "";
332	}
333	}
334
335	public static boolean isIndexable(Attributes atts)
336	{
337	boolean is_indexable = false;
338
339	String index = atts.getValue("index");
340	if (index!=null) {
341	if (index.equals("1")) {
342	is_indexable = true;
343	}
344	}
345	return is_indexable;
346	}
347
348	public void endElement(String uri, String localName, String qName) throws SAXException
349	{
350	if (mode_.equals("delete")) {
351	try {
352	deleteDocument(current_doc_oid_);
353	}
354	catch (java.io.IOException e) {
355	e.printStackTrace();
356	}
357	}
358	else if (mode_.equals("add") \|\| mode_.equals("update")) {
359	if (qName.equals(indexable_current_node_))
360	{
361	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
362	// The byXX fields are used for sorting search results
363	// We don't want to do that for Text or AllFields fields
364	// They need to be untokenised for sorting
365	if (!qName.equals("TX") && !qName.equals("ZZ"))
366	{
367	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
368	}
369
370	current_contents_ = "";
371	}
372
373	if (qName.equals(doc_tag_level_)) {
374	try {
375	// perhaps this is more efficient if addDocument()
376	// used for "add" and updateDocument() for "update"
377	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
378	}
379	catch (java.io.IOException e) {
380	e.printStackTrace();
381	}
382	popOffStack(); // end document
383	}
384
385	path_ = removePathLink(path_);
386	}
387	}
388
389	public void characters(char ch[], int start, int length) throws SAXException
390	{
391	String data = new String(ch, start, length).trim();
392	if (data.length() > 0 ) {
393	current_contents_ += data;
394	}
395	}
396
397	protected String appendPathLink(String path, String qName, Attributes atts)
398	{
399
400	path = path + "/"+qName;
401	if (atts.getLength()>0) {
402	// was gs2:id, changed to gs2:docOID --kjdon
403	String id = atts.getValue("gs2:docOID");
404	if (id != null) {
405	path += "[@gs2:docOID='"+id+"']";
406	}
407	else {
408	// is this ever used? not in perl currently
409	id = atts.getValue("gs3:id");
410	if (id != null) {
411	path += "[@gs3:id='"+id+"']";
412	}
413	}
414	}
415	return path;
416	}
417
418	protected String removePathLink(String path)
419	{
420
421	int i=path.lastIndexOf('/');
422	if (i==-1) {
423	path="";
424	} else {
425	path = path.substring(0, i);
426	}
427	return path;
428	}
429
430
431	/** these are what we save on the stack */
432	private class MyDocument
433	{
434	public Document doc = null;
435	public String contents = null;
436	public String tagname = "";
437
438	}
439
440
441	protected void pushOnStack()
442	{
443	if (current_doc_ != null) {
444	MyDocument save = new MyDocument();
445	save.doc = current_doc_;
446	save.contents = current_contents_;
447	save.tagname = current_node_;
448	stack_.push(save);
449	}
450	current_doc_ = new Document();
451	current_contents_ = "";
452	current_node_ = "";
453	}
454
455	protected void popOffStack()
456	{
457	if (!stack_.empty()) {
458	MyDocument saved = (MyDocument)stack_.pop();
459	current_doc_ = saved.doc;
460	current_contents_ = saved.contents;
461	current_node_ = saved.tagname;
462	} else {
463	current_doc_ = new Document();
464	current_contents_ = "";
465	current_node_ = "";
466	}
467	}
468
469
470	protected void deleteDocument(String doc_id)
471	throws IOException
472	{
473	debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
474	debug("- Initial number of documents in index: " + writer_.docCount());
475	writer_.deleteDocuments(new Term("docOID", doc_id));
476	debug("- Final number of documents in index: " + writer_.docCount());
477	}
478
479
480	}
481	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: