Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 24641

Last change on this file since 24641 was 24641, checked in by davidb, 13 years ago
Initial cut at Greenstone3 runtime code to support Solr. The Solr code is based on version 3.3, so this also includes an upgraded version of the LuceneWrapper code (gs2build/common-src/indexers/lucene-gs) that works with this version of the support jar files.
Property svn:executable set to *
File size: 12.1 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import org.apache.lucene.store.SimpleFSDirectory;
49	import org.apache.lucene.index.IndexWriter.MaxFieldLength;
50
51	import java.util.Stack;
52	import java.io.FileInputStream;
53	import java.io.File;
54	import java.io.StringReader;
55	import java.net.URL;
56
57
58	/**
59	* class for indexing XML generated by lucenebuildproc.pm
60	*/
61
62	public class GS2LuceneIndexer {
63
64	protected static boolean debug = false;
65
66	protected static void debug(String message)
67	{
68	if (debug) {
69	System.err.println(message);
70	}
71	}
72
73
74	public static void main (String args[]) throws Exception
75	{
76	int verbosity = 1;
77	// Default is to edit the existing index
78	boolean create_new_index = false;
79
80	Vector filtered_args = new Vector();
81
82	int argc = args.length;
83	int i = 0;
84	while (i<argc) {
85	if (args[i].startsWith("-")) {
86
87	// -removeold causes the existing index to be overwritten
88	if (args[i].equals("-removeold")) {
89	create_new_index = true;
90	}
91
92	// -verbosity [num]
93	else if (args[i].equals("-verbosity")) {
94	i++;
95	if (i<argc) {
96	verbosity = Integer.parseInt(args[i]);
97	if (verbosity>=5) {
98	debug = true;
99	}
100	}
101	}
102	else if (args[i].equals("-debug")) {
103	debug = true;
104	}
105	else {
106	System.err.println("Unrecognised option: " + args[i]);
107	}
108	}
109	else {
110	filtered_args.add((Object)args[i]);
111	}
112	i++;
113	}
114
115	if (filtered_args.size() != 3) {
116	System.err.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
117	return;
118	}
119
120	String doc_tag_level = (String)filtered_args.get(0);
121	String building_dirname = (String)filtered_args.get(1);
122	String index_dirname = (String)filtered_args.get(2);
123
124	String import_dirname = building_dirname + File.separator + "text";
125
126	File import_dir = new File(import_dirname);
127	File building_dir = new File(building_dirname);
128
129	if (!import_dir.exists()) {
130	System.err.println("Couldn't find import directory: "+import_dirname);
131	return;
132	}
133
134	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
135	idx_dir.mkdir();
136
137	// Set up indexer
138	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
139
140	// Read from stdin the files to process
141	try {
142	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
143	BufferedReader brin = new BufferedReader(isr);
144
145	StringBuffer xml_text = new StringBuffer(1024);
146	String line = null;
147	while ((line = brin.readLine()) != null) {
148	xml_text.append(line);
149	xml_text.append(" ");
150
151	debug("Got line " + line);
152
153	if (line.endsWith("</Delete>")) {
154
155	indexer.delete(xml_text.toString());
156	xml_text = new StringBuffer(1024);
157	}
158	else if (line.startsWith("</Doc>")) {
159	indexer.index(xml_text.toString());
160	xml_text = new StringBuffer(1024);
161	}
162	}
163
164	brin.close();
165	isr.close();
166
167	} catch (IOException e) {
168	System.err.println("Error: unable to read from stdin");
169	e.printStackTrace();
170	}
171
172	indexer.finish();
173	}
174
175
176	static public class Indexer extends DefaultHandler
177	{
178	IndexWriter writer_ = null;
179	Analyzer analyzer_ = null;
180	SAXParser sax_parser_ = null;
181	String doc_tag_level_ = null;
182
183	Stack stack_ = null;
184	String path_ = "";
185
186	Document current_doc_ = null;
187	String current_node_ = "";
188	String current_doc_oid_ = "";
189	String indexable_current_node_ = "";
190	String current_contents_ = "";
191
192	String mode_ = "";
193	protected String file_id_ = null;
194
195	/** pass in true if want to create a new index, false if want to use the existing one */
196	public Indexer (String doc_tag_level, File index_dir, boolean create)
197	{
198	doc_tag_level_ = doc_tag_level;
199
200	try {
201	stack_ = new Stack();
202	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
203	sax_parser_ = sax_factory.newSAXParser();
204
205	XMLReader reader = sax_parser_.getXMLReader();
206	reader.setFeature("http://xml.org/sax/features/validation", false);
207
208	SimpleFSDirectory index_dir_dir = new SimpleFSDirectory(new File(index_dir.getPath()));
209
210	analyzer_ = new GS2Analyzer(); // uses build in stop_word_set
211
212	writer_ = new IndexWriter(index_dir_dir, analyzer_, create, MaxFieldLength.UNLIMITED);
213
214	// by default, will only index 10,000 words per document
215	// Can throw out_of_memory errors
216	writer_.setMaxFieldLength(Integer.MAX_VALUE);
217	if (create) {
218	writer_.optimize();
219	}
220	}
221	catch (Exception e) {
222	// We need to know if creating/opening the index fails
223	e.printStackTrace();
224	}
225	}
226
227	/** index one document */
228	public void index (String file_id, File file)
229	{
230	mode_ = "add";
231	file_id_ = file_id;
232	path_ = "";
233	String base_path = file.getPath();
234	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
235
236	try {
237	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
238	}
239	catch (Exception e) {
240	println("parse error:");
241	e.printStackTrace();
242	}
243	}
244
245	/** index one document stored as string*/
246	public void index (String xml_text)
247	{
248	mode_ = "add";
249	file_id_ = "<xml doc on stdin>";
250	path_ = "";
251
252	try {
253	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
254	}
255	catch (Exception e) {
256	println("parse error:");
257	e.printStackTrace();
258	}
259	}
260
261	/** delete one document, based on doc_id in <Delete>doc_id</Delete> */
262	public void delete(String xml_text)
263	{
264	mode_ = "delete";
265	file_id_ = "<delete doc>";
266	path_ = "";
267
268	try {
269	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
270	}
271	catch (Exception e) {
272	println("parse error:");
273	e.printStackTrace();
274	}
275	}
276
277	public void finish()
278	{
279	/** optimise the index */
280	try {
281	writer_.optimize();
282	writer_.close();
283	}
284	catch (Exception e) {
285	}
286	}
287
288	protected void print(String s)
289	{
290	System.err.print(s);
291	}
292
293	protected void println(String s)
294	{
295	System.err.println(s);
296	}
297
298	public void startDocument() throws SAXException
299	{
300	println("Starting to process " + file_id_);
301	print("[");
302	}
303
304	public void endDocument() throws SAXException
305	{
306	println("]");
307	println("... processing finished.");
308	}
309
310	public void startElement(String uri, String localName, String qName, Attributes atts)
311	throws SAXException
312	{
313	path_ = appendPathLink(path_, qName, atts);
314
315	if (qName.equals(doc_tag_level_)) {
316	mode_ = atts.getValue("gs2:mode");
317
318	pushOnStack(); // start new doc
319	current_node_ = qName;
320
321	//String node_id = atts.getValue("gs2:id");
322	//print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
323	//current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));
324
325	current_doc_oid_ = atts.getValue("gs2:docOID");
326	print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
327	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
328	}
329
330	if (isIndexable(atts)) {
331	indexable_current_node_ = qName;
332	}
333	else {
334	indexable_current_node_ = "";
335	}
336	}
337
338	public static boolean isIndexable(Attributes atts)
339	{
340	boolean is_indexable = false;
341
342	String index = atts.getValue("index");
343	if (index!=null) {
344	if (index.equals("1")) {
345	is_indexable = true;
346	}
347	}
348	return is_indexable;
349	}
350
351	public void endElement(String uri, String localName, String qName) throws SAXException
352	{
353	if (mode_.equals("delete")) {
354	try {
355	deleteDocument(current_doc_oid_);
356	}
357	catch (java.io.IOException e) {
358	e.printStackTrace();
359	}
360	}
361	else if (mode_.equals("add") \|\| mode_.equals("update")) {
362	if (qName.equals(indexable_current_node_))
363	{
364	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
365	// The byXX fields are used for sorting search results
366	// We don't want to do that for Text or AllFields fields
367	// They need to be untokenised for sorting
368	if (!qName.equals("TX") && !qName.equals("ZZ"))
369	{
370	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
371	}
372
373	current_contents_ = "";
374	}
375
376	if (qName.equals(doc_tag_level_)) {
377	try {
378	// perhaps this is more efficient if addDocument()
379	// used for "add" and updateDocument() for "update"
380	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
381	}
382	catch (java.io.IOException e) {
383	e.printStackTrace();
384	}
385	popOffStack(); // end document
386	}
387
388	path_ = removePathLink(path_);
389	}
390	}
391
392	public void characters(char ch[], int start, int length) throws SAXException
393	{
394	String data = new String(ch, start, length).trim();
395	if (data.length() > 0 ) {
396	current_contents_ += data;
397	}
398	}
399
400	protected String appendPathLink(String path, String qName, Attributes atts)
401	{
402
403	path = path + "/"+qName;
404	if (atts.getLength()>0) {
405	// was gs2:id, changed to gs2:docOID --kjdon
406	String id = atts.getValue("gs2:docOID");
407	if (id != null) {
408	path += "[@gs2:docOID='"+id+"']";
409	}
410	else {
411	// is this ever used? not in perl currently
412	id = atts.getValue("gs3:id");
413	if (id != null) {
414	path += "[@gs3:id='"+id+"']";
415	}
416	}
417	}
418	return path;
419	}
420
421	protected String removePathLink(String path)
422	{
423
424	int i=path.lastIndexOf('/');
425	if (i==-1) {
426	path="";
427	} else {
428	path = path.substring(0, i);
429	}
430	return path;
431	}
432
433
434	/** these are what we save on the stack */
435	private class MyDocument
436	{
437	public Document doc = null;
438	public String contents = null;
439	public String tagname = "";
440
441	}
442
443
444	protected void pushOnStack()
445	{
446	if (current_doc_ != null) {
447	MyDocument save = new MyDocument();
448	save.doc = current_doc_;
449	save.contents = current_contents_;
450	save.tagname = current_node_;
451	stack_.push(save);
452	}
453	current_doc_ = new Document();
454	current_contents_ = "";
455	current_node_ = "";
456	}
457
458	protected void popOffStack()
459	{
460	if (!stack_.empty()) {
461	MyDocument saved = (MyDocument)stack_.pop();
462	current_doc_ = saved.doc;
463	current_contents_ = saved.contents;
464	current_node_ = saved.tagname;
465	} else {
466	current_doc_ = new Document();
467	current_contents_ = "";
468	current_node_ = "";
469	}
470	}
471
472
473	protected void deleteDocument(String doc_id)
474	throws IOException
475	{
476	debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
477	debug("- Initial number of documents in index: " + writer_.numDocs());
478	writer_.deleteDocuments(new Term("docOID", doc_id));
479	debug("- Final number of documents in index: " + writer_.numDocs());
480	}
481
482
483	}
484	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: