Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 19861

Last change on this file since 19861 was 19861, checked in by kjdon, 15 years ago
Fixed a bug where the last word on each line was not searchable. When the lines were read in, they were concatenated without any space in between, so the last word of one line was getting stuck to the first word of the next. A space is now added between each line of text.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.7 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	protected static boolean debug = false;
62
63	protected static void debug(String message)
64	{
65	if (debug) {
66	System.err.println(message);
67	}
68	}
69
70
71	public static void main (String args[]) throws Exception
72	{
73	int verbosity = 1;
74	// Default is to edit the existing index
75	boolean create_new_index = false;
76
77	Vector filtered_args = new Vector();
78
79	int argc = args.length;
80	int i = 0;
81	while (i<argc) {
82	if (args[i].startsWith("-")) {
83
84	// -removeold causes the existing index to be overwritten
85	if (args[i].equals("-removeold")) {
86	create_new_index = true;
87	}
88
89	// -verbosity [num]
90	else if (args[i].equals("-verbosity")) {
91	i++;
92	if (i<argc) {
93	verbosity = Integer.parseInt(args[i]);
94	if (verbosity>=5) {
95	debug = true;
96	}
97	}
98	}
99	else if (args[i].equals("-debug")) {
100	debug = true;
101	}
102	else {
103	System.out.println("Unrecognised option: " + args[i]);
104	}
105	}
106	else {
107	filtered_args.add((Object)args[i]);
108	}
109	i++;
110	}
111
112	if (filtered_args.size() != 3) {
113	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
114	return;
115	}
116
117	String doc_tag_level = (String)filtered_args.get(0);
118	String building_dirname = (String)filtered_args.get(1);
119	String index_dirname = (String)filtered_args.get(2);
120
121	String import_dirname = building_dirname + File.separator + "text";
122
123	File import_dir = new File(import_dirname);
124	File building_dir = new File(building_dirname);
125
126	if (!import_dir.exists()) {
127	System.out.println("Couldn't find import directory: "+import_dirname);
128	return;
129	}
130
131	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
132	idx_dir.mkdir();
133
134	// Set up indexer
135	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
136
137	// Read from stdin the files to process
138	try {
139	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
140	BufferedReader brin = new BufferedReader(isr);
141
142	StringBuffer xml_text = new StringBuffer(1024);
143	String line = null;
144	while ((line = brin.readLine()) != null) {
145	xml_text.append(line);
146	xml_text.append(" ");
147
148	debug("Got line " + line);
149
150	if (line.endsWith("</Delete>")) {
151
152	indexer.delete(xml_text.toString());
153	xml_text = new StringBuffer(1024);
154	}
155	else if (line.startsWith("</Doc>")) {
156	indexer.index(xml_text.toString());
157	xml_text = new StringBuffer(1024);
158	}
159	}
160
161	brin.close();
162	isr.close();
163
164	} catch (IOException e) {
165	System.err.println("Error: unable to read from stdin");
166	e.printStackTrace();
167	}
168
169	indexer.finish();
170	}
171
172
173	static public class Indexer extends DefaultHandler
174	{
175	IndexWriter writer_ = null;
176	Analyzer analyzer_ = null;
177	SAXParser sax_parser_ = null;
178	String doc_tag_level_ = null;
179
180	Stack stack_ = null;
181	String path_ = "";
182
183	Document current_doc_ = null;
184	String current_node_ = "";
185	String current_doc_oid_ = "";
186	String indexable_current_node_ = "";
187	String current_contents_ = "";
188
189	String mode_ = "";
190	protected String file_id_ = null;
191
192	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
193
194
195	/** pass in true if want to create a new index, false if want to use the existing one */
196	public Indexer (String doc_tag_level, File index_dir, boolean create)
197	{
198	doc_tag_level_ = doc_tag_level;
199
200	try {
201	stack_ = new Stack();
202	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203	sax_parser_ = sax_factory.newSAXParser();
204
205	XMLReader reader = sax_parser_.getXMLReader();
206	reader.setFeature("http://xml.org/sax/features/validation", false);
207
208	analyzer_ = new GS2Analyzer(stop_words);
209
210	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
211	// by default, will only index 10,000 words per document
212	// Can throw out_of_memory errors
213	writer_.setMaxFieldLength(Integer.MAX_VALUE);
214	if (create) {
215	writer_.optimize();
216	}
217	}
218	catch (Exception e) {
219	// We need to know if creating/opening the index fails
220	e.printStackTrace();
221	}
222	}
223
224	/** index one document */
225	public void index (String file_id, File file)
226	{
227	mode_ = "add";
228	file_id_ = file_id;
229	path_ = "";
230	String base_path = file.getPath();
231	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
232
233	try {
234	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
235	}
236	catch (Exception e) {
237	println("parse error:");
238	e.printStackTrace();
239	}
240	}
241
242	/** index one document stored as string*/
243	public void index (String xml_text)
244	{
245	mode_ = "add";
246	file_id_ = "<xml doc on stdin>";
247	path_ = "";
248
249	try {
250	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
251	}
252	catch (Exception e) {
253	println("parse error:");
254	e.printStackTrace();
255	}
256	}
257
258	/** delete one document, based on doc_id in <Delete>doc_id</Delete> */
259	public void delete(String xml_text)
260	{
261	mode_ = "delete";
262	file_id_ = "<delete doc>";
263	path_ = "";
264
265	try {
266	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
267	}
268	catch (Exception e) {
269	println("parse error:");
270	e.printStackTrace();
271	}
272	}
273
274	public void finish()
275	{
276	/** optimise the index */
277	try {
278	writer_.optimize();
279	writer_.close();
280	}
281	catch (Exception e) {
282	}
283	}
284
285	protected void print(String s)
286	{
287	System.out.print(s);
288	}
289
290	protected void println(String s)
291	{
292	System.out.println(s);
293	}
294
295	public void startDocument() throws SAXException
296	{
297	println("Starting to process " + file_id_);
298	print("[");
299	}
300
301	public void endDocument() throws SAXException
302	{
303	println("]");
304	println("... processing finished.");
305	}
306
307	public void startElement(String uri, String localName, String qName, Attributes atts)
308	throws SAXException
309	{
310	path_ = appendPathLink(path_, qName, atts);
311
312	if (qName.equals(doc_tag_level_)) {
313	mode_ = atts.getValue("gs2:mode");
314
315	pushOnStack(); // start new doc
316	current_node_ = qName;
317
318	String node_id = atts.getValue("gs2:id");
319	print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
320	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
321
322	current_doc_oid_ = atts.getValue("gs2:docOID");
323	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
324	}
325
326	if (isIndexable(atts)) {
327	indexable_current_node_ = qName;
328	}
329	else {
330	indexable_current_node_ = "";
331	}
332	}
333
334	public static boolean isIndexable(Attributes atts)
335	{
336	boolean is_indexable = false;
337
338	String index = atts.getValue("index");
339	if (index!=null) {
340	if (index.equals("1")) {
341	is_indexable = true;
342	}
343	}
344	return is_indexable;
345	}
346
347	public void endElement(String uri, String localName, String qName) throws SAXException
348	{
349	if (mode_.equals("delete")) {
350	try {
351	deleteDocument(current_doc_oid_);
352	}
353	catch (java.io.IOException e) {
354	e.printStackTrace();
355	}
356	}
357	else if (mode_.equals("add") \|\| mode_.equals("update")) {
358	if (qName.equals(indexable_current_node_))
359	{
360	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
361	// The byXX fields are used for sorting search results
362	// We don't want to do that for Text or AllFields fields
363	// They need to be untokenised for sorting
364	if (!qName.equals("TX") && !qName.equals("ZZ"))
365	{
366	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
367	}
368
369	current_contents_ = "";
370	}
371
372	if (qName.equals(doc_tag_level_)) {
373	try {
374	// perhaps this is more efficient if addDocument()
375	// used for "add" and updateDocument() for "update"
376	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
377	}
378	catch (java.io.IOException e) {
379	e.printStackTrace();
380	}
381	popOffStack(); // end document
382	}
383
384	path_ = removePathLink(path_);
385	}
386	}
387
388	public void characters(char ch[], int start, int length) throws SAXException
389	{
390	String data = new String(ch, start, length).trim();
391	if (data.length() > 0 ) {
392	current_contents_ += data;
393	}
394	}
395
396	protected String appendPathLink(String path, String qName, Attributes atts)
397	{
398
399	path = path + "/"+qName;
400	if (atts.getLength()>0) {
401	String id = atts.getValue("gs2:id");
402	if (id != null) {
403	path += "[@gs2:id='"+id+"']";
404	}
405	else {
406	id = atts.getValue("gs3:id");
407	if (id != null) {
408	path += "[@gs3:id='"+id+"']";
409	}
410	}
411	}
412	return path;
413	}
414
415	protected String removePathLink(String path)
416	{
417
418	int i=path.lastIndexOf('/');
419	if (i==-1) {
420	path="";
421	} else {
422	path = path.substring(0, i);
423	}
424	return path;
425	}
426
427
428	/** these are what we save on the stack */
429	private class MyDocument
430	{
431	public Document doc = null;
432	public String contents = null;
433	public String tagname = "";
434
435	}
436
437
438	protected void pushOnStack()
439	{
440	if (current_doc_ != null) {
441	MyDocument save = new MyDocument();
442	save.doc = current_doc_;
443	save.contents = current_contents_;
444	save.tagname = current_node_;
445	stack_.push(save);
446	}
447	current_doc_ = new Document();
448	current_contents_ = "";
449	current_node_ = "";
450	}
451
452	protected void popOffStack()
453	{
454	if (!stack_.empty()) {
455	MyDocument saved = (MyDocument)stack_.pop();
456	current_doc_ = saved.doc;
457	current_contents_ = saved.contents;
458	current_node_ = saved.tagname;
459	} else {
460	current_doc_ = new Document();
461	current_contents_ = "";
462	current_node_ = "";
463	}
464	}
465
466
467	protected void deleteDocument(String doc_id)
468	throws IOException
469	{
470	debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
471	debug("- Initial number of documents in index: " + writer_.docCount());
472	writer_.deleteDocuments(new Term("docOID", doc_id));
473	debug("- Final number of documents in index: " + writer_.docCount());
474	}
475
476
477	}
478	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: