source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneIndexer.java@29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of the port from lucene3.3.0 to lucene4.7.2, related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling either Lucene4Wrapper.jar or Lucene3Wrapper.jar; only the Linux Makefile.in has been tested so far. 2. Adding the jar files needed for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene source code to use lucene4.7.2 instead of lucene3.3.0.

  • Property svn:executable set to *
File size: 12.2 KB
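The indexer below is normally run by the Greenstone build scripts as `java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index`, reading the XML emitted by lucenebuildproc.pm on stdin. The Indexer inner class can also be driven directly. The following is a minimal sketch of that (not part of the file itself): the class name IndexerUsageSketch, the index path "building/idx", the field name TI and the docOID value are all illustrative, and it assumes the LuceneWrapper4 classes (GS2Analyzer, GSLuceneUtil) and the lucene4 jars are on the classpath.

import java.io.File;
import org.greenstone.LuceneWrapper4.GS2LuceneIndexer.Indexer;

public class IndexerUsageSketch
{
    public static void main(String[] args) throws Exception
    {
        // Open (or create) the index directory, treating <Doc> as the document tag level.
        Indexer indexer = new Indexer("Doc", new File("building/idx"), true /* like -removeold */);

        // Illustrative input only; the real XML comes from lucenebuildproc.pm.
        // The SAX handler reads gs2:docOID and gs2:mode from the document tag,
        // and index="1" / tokenize="0|1" from each field element.
        String doc_xml =
            "<Doc gs2:docOID=\"HASH0123example\" gs2:mode=\"add\">"
            + "<TI index=\"1\" tokenize=\"1\">An example title</TI>"
            + "</Doc>";
        indexer.index(doc_xml);

        // Close the underlying IndexWriter.
        indexer.finish();
    }
}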
/**********************************************************************
 *
 * GS2LuceneIndexer.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

package org.greenstone.LuceneWrapper4;


import java.io.*;
import java.util.Vector;

import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.Analyzer;


import org.apache.lucene.util.Version;

import java.util.Stack;
import java.io.FileInputStream;
import java.io.File;
import java.io.StringReader;
import java.net.URL;


/**
 * class for indexing XML generated by lucenebuildproc.pm
 */

public class GS2LuceneIndexer {

    protected static boolean debug = false;

    protected static void debug(String message)
    {
        if (debug) {
            System.err.println(message);
        }
    }


    public static void main (String args[]) throws Exception
    {
        int verbosity = 1;
        // Default is to edit the existing index
        boolean create_new_index = false;

        Vector filtered_args = new Vector();

        int argc = args.length;
        int i = 0;
        while (i<argc) {
            if (args[i].startsWith("-")) {

                // -removeold causes the existing index to be overwritten
                if (args[i].equals("-removeold")) {
                    create_new_index = true;
                }

                // -verbosity [num]
                else if (args[i].equals("-verbosity")) {
                    i++;
                    if (i<argc) {
                        verbosity = Integer.parseInt(args[i]);
                        if (verbosity>=5) {
                            debug = true;
                        }
                    }
                }
                else if (args[i].equals("-debug")) {
                    debug = true;
                }
                else {
                    System.err.println("Unrecognised option: " + args[i]);
                }
            }
            else {
                filtered_args.add((Object)args[i]);
            }
            i++;
        }

        if (filtered_args.size() != 3) {
            System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
            return;
        }

        String doc_tag_level = (String)filtered_args.get(0);
        String building_dirname = (String)filtered_args.get(1);
        String index_dirname = (String)filtered_args.get(2);

        String import_dirname = building_dirname + File.separator + "text";

        File import_dir = new File(import_dirname);
        File building_dir = new File(building_dirname);

        if (!import_dir.exists()) {
            System.err.println("Couldn't find import directory: "+import_dirname);
            return;
        }

        File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
        idx_dir.mkdir();

        // Set up indexer
        Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);

        // Read from stdin the files to process
        try {
            InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
            BufferedReader brin = new BufferedReader(isr);

            StringBuffer xml_text = new StringBuffer(1024);
            String line = null;
            while ((line = brin.readLine()) != null) {
                xml_text.append(line);
                xml_text.append(" ");

                debug("Got line " + line);

                if (line.endsWith("</Delete>")) {

                    indexer.delete(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
                else if (line.startsWith("</Doc>")) {
                    indexer.index(xml_text.toString());
                    xml_text = new StringBuffer(1024);
                }
            }

            brin.close();
            isr.close();

        } catch (IOException e) {
            System.err.println("Error: unable to read from stdin");
            e.printStackTrace();
        }

        indexer.finish();
    }

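    // Summary of the Indexer below: a SAX handler that, for each element matching the
    // doc-tag-level, starts a new Lucene Document, stores its gs2:docOID, adds one field
    // per element marked index="1" (analyzed unless tokenize="0"), and writes it with
    // IndexWriter.updateDocument() keyed on the docOID term. In "delete" mode it instead
    // removes documents matching that docOID term.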

    static public class Indexer extends DefaultHandler
    {
        IndexWriter writer_ = null;
        Analyzer analyzer_ = null;
        SAXParser sax_parser_ = null;
        String doc_tag_level_ = null;

        Stack stack_ = null;
        String path_ = "";

        Document current_doc_ = null;
        String current_node_ = "";
        String current_doc_oid_ = "";
        String indexable_current_node_ = "";
        boolean tokenize = true;
        String current_contents_ = "";

        String mode_ = "";
        protected String file_id_ = null;

        /** Pass in true to create a new index, false to add to the existing one. */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                analyzer_ = new GS2Analyzer(); // uses the built-in stop_word_set
                writer_ = GSLuceneUtil.getIndexWriter(index_dir.getPath(), analyzer_, create);
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }

        /** Index one document read from a file. */
        public void index (String file_id, File file)
        {
            mode_ = "add";
            file_id_ = file_id;
            path_ = "";
            String base_path = file.getPath();
            base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));

            try {
                sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Index one document stored as a string. */
        public void index (String xml_text)
        {
            mode_ = "add";
            file_id_ = "<xml doc on stdin>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Delete one document, based on the doc_id in <Delete>doc_id</Delete>. */
        public void delete(String xml_text)
        {
            mode_ = "delete";
            file_id_ = "<delete doc>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /** Close the index writer. (optimize() is deprecated in Lucene 4, so the index is no longer explicitly optimised here.) */
        public void finish()
        {
            try {
                // writer_.optimize(); // now deprecated
                writer_.close();
            }
            catch (Exception e) {
                e.printStackTrace();
            }
        }

        protected void print(String s)
        {
            System.err.print(s);
        }

        protected void println(String s)
        {
            System.err.println(s);
        }

        public void startDocument() throws SAXException
        {
            println("Starting to process " + file_id_);
            print("[");
        }

        public void endDocument() throws SAXException
        {
            println("]");
            println("... processing finished.");
        }

        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start new doc
                current_node_ = qName;

                //String node_id = atts.getValue("gs2:id");
                //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
                //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
                if (isTokenized(atts)) {
                    tokenize = true;
                } else {
                    tokenize = false;
                }
            }
            else {
                indexable_current_node_ = "";
            }
        }

        public static boolean isTokenized(Attributes atts) {
            boolean tokenize = true;
            String tok = atts.getValue("tokenize");
            if (tok!=null && tok.equals("0")) {
                tokenize = false;
            }
            return tokenize;
        }

        public static boolean isIndexable(Attributes atts)
        {
            boolean is_indexable = false;

            String index = atts.getValue("index");
            if (index!=null) {
                if (index.equals("1")) {
                    is_indexable = true;
                }
            }
            return is_indexable;
        }

        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_))
                {
                    if (tokenize) {
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
                    } else {
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    }
                    // // The byXX fields are used for sorting search results
                    // // We don't want to do that for Text or AllFields fields
                    // // They need to be untokenised for sorting
                    // if (!qName.equals("TX") && !qName.equals("ZZ"))
                    // {
                    //     current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    // }

                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // It might be more efficient to use addDocument() for "add"
                        // and updateDocument() only for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end document
                }

                path_ = removePathLink(path_);
            }
        }

        public void characters(char ch[], int start, int length) throws SAXException
        {
            String data = new String(ch, start, length).trim();
            if (data.length() > 0 ) {
                current_contents_ += data;
            }
        }

        protected String appendPathLink(String path, String qName, Attributes atts)
        {

            path = path + "/"+qName;
            if (atts.getLength()>0) {
                // was gs2:id, changed to gs2:docOID --kjdon
                String id = atts.getValue("gs2:docOID");
                if (id != null) {
                    path += "[@gs2:docOID='"+id+"']";
                }
                else {
                    // is this ever used? not in perl currently
                    id = atts.getValue("gs3:id");
                    if (id != null) {
                        path += "[@gs3:id='"+id+"']";
                    }
                }
            }
            return path;
        }

        protected String removePathLink(String path)
        {

            int i=path.lastIndexOf('/');
            if (i==-1) {
                path="";
            } else {
                path = path.substring(0, i);
            }
            return path;
        }


        /** These are what we save on the stack. */
        private class MyDocument
        {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";

        }


        protected void pushOnStack()
        {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        protected void popOffStack()
        {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument)stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            } else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }


        protected void deleteDocument(String doc_id)
            throws IOException
        {
            debug("GS2LuceneIndexer.Indexer.deleteDocument(" + doc_id + ")");
            debug("- Initial number of documents in index: " + writer_.numDocs());
            writer_.deleteDocuments(new Term("docOID", doc_id));
            debug("- Final number of documents in index: " + writer_.numDocs());
        }


    }
}
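The Indexer constructor above delegates opening the index to GSLuceneUtil.getIndexWriter(index_dir.getPath(), analyzer_, create), which lives elsewhere in the LuceneWrapper4 sources and is not shown here. As a rough sketch of what such a call amounts to against the stock Lucene 4.7 API (the class IndexWriterOpenSketch and method openIndexWriter below are illustrative, not the actual Greenstone helper):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

class IndexWriterOpenSketch
{
    // Illustrative stand-in for GSLuceneUtil.getIndexWriter(path, analyzer, create).
    static IndexWriter openIndexWriter(String index_path, Analyzer analyzer, boolean create)
        throws IOException
    {
        Directory dir = FSDirectory.open(new File(index_path));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        // -removeold maps to CREATE (overwrite any existing index);
        // otherwise open for appending, creating the index if it does not exist yet.
        config.setOpenMode(create ? IndexWriterConfig.OpenMode.CREATE
                                  : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        return new IndexWriter(dir, config);
    }
}

With a writer opened this way, the updateDocument(), deleteDocuments() and close() calls in the listing behave as in stock Lucene 4.7.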