source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneIndexer.java@ 27359

Last change on this file since 27359 was 27359, checked in by kjdon, 11 years ago

sort fields are now separate from index fields. index fields will be like <TI index=1> and sort fields will be like <byTI index=1 tokenize=0>

  • Property svn:executable set to *
File size: 12.6 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneIndexer.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.LuceneWrapper3;
28
29
30import java.io.*;
31import java.util.Vector;
32
33import org.xml.sax.Attributes;
34import org.xml.sax.helpers.DefaultHandler;
35import org.xml.sax.InputSource;
36import org.xml.sax.SAXException;
37import org.xml.sax.XMLReader;
38
39import javax.xml.parsers.SAXParser;
40import javax.xml.parsers.SAXParserFactory;
41
42import org.apache.lucene.document.Document;
43import org.apache.lucene.document.Field;
44import org.apache.lucene.index.IndexWriter;
45import org.apache.lucene.index.Term;
46import org.apache.lucene.analysis.Analyzer;
47
48import org.apache.lucene.store.SimpleFSDirectory;
49import org.apache.lucene.index.IndexWriter.MaxFieldLength;
50
51import java.util.Stack;
52import java.io.FileInputStream;
53import java.io.File;
54import java.io.StringReader;
55import java.net.URL;
56
57
58/**
59 * class for indexing XML generated by lucenebuildproc.pm
60 */
61
62public class GS2LuceneIndexer {
63
64 protected static boolean debug = false;
65
66 protected static void debug(String message)
67 {
68 if (debug) {
69 System.err.println(message);
70 }
71 }
72
73
74 public static void main (String args[]) throws Exception
75 {
76 int verbosity = 1;
77 // Default is to edit the existing index
78 boolean create_new_index = false;
79
80 Vector filtered_args = new Vector();
81
82 int argc = args.length;
83 int i = 0;
84 while (i<argc) {
85 if (args[i].startsWith("-")) {
86
87 // -removeold causes the existing index to be overwritten
88 if (args[i].equals("-removeold")) {
89 create_new_index = true;
90 }
91
92 // -verbosity [num]
93 else if (args[i].equals("-verbosity")) {
94 i++;
95 if (i<argc) {
96 verbosity = Integer.parseInt(args[i]);
97 if (verbosity>=5) {
98 debug = true;
99 }
100 }
101 }
102 else if (args[i].equals("-debug")) {
103 debug = true;
104 }
105 else {
106 System.err.println("Unrecognised option: " + args[i]);
107 }
108 }
109 else {
110 filtered_args.add((Object)args[i]);
111 }
112 i++;
113 }
114
115 if (filtered_args.size() != 3) {
116 System.err.println("Usage: java GS2LuceneIndexer [-removeold|-verbosity [num]] doc-tag-level building_dir index");
117 return;
118 }
119
120 String doc_tag_level = (String)filtered_args.get(0);
121 String building_dirname = (String)filtered_args.get(1);
122 String index_dirname = (String)filtered_args.get(2);
123
124 String import_dirname = building_dirname + File.separator + "text";
125
126 File import_dir = new File(import_dirname);
127 File building_dir = new File(building_dirname);
128
129 if (!import_dir.exists()) {
130 System.err.println("Couldn't find import directory: "+import_dirname);
131 return;
132 }
133
134 File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
135 idx_dir.mkdir();
136
137 // Set up indexer
138 Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
139
140 // Read from stdin the files to process
141 try {
142 InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
143 BufferedReader brin = new BufferedReader(isr);
144
145 StringBuffer xml_text = new StringBuffer(1024);
146 String line = null;
147 while ((line = brin.readLine()) != null) {
148 xml_text.append(line);
149 xml_text.append(" ");
150
151 debug("Got line " + line);
152
153 if (line.endsWith("</Delete>")) {
154
155 indexer.delete(xml_text.toString());
156 xml_text = new StringBuffer(1024);
157 }
158 else if (line.startsWith("</Doc>")) {
159 indexer.index(xml_text.toString());
160 xml_text = new StringBuffer(1024);
161 }
162 }
163
164 brin.close();
165 isr.close();
166
167 } catch (IOException e) {
168 System.err.println("Error: unable to read from stdin");
169 e.printStackTrace();
170 }
171
172 indexer.finish();
173 }
174
175
176 static public class Indexer extends DefaultHandler
177 {
        // Lucene writer for the index being built/updated (null if the ctor failed)
        IndexWriter writer_ = null;
        // analyzer shared between the writer and updateDocument() calls
        Analyzer analyzer_ = null;
        // reusable SAX parser; each index()/delete() call re-parses with 'this' as handler
        SAXParser sax_parser_ = null;
        // element name that delimits one Lucene document (e.g. "Doc")
        String doc_tag_level_ = null;

        // saved per-document state (MyDocument entries) for nested document elements
        Stack stack_ = null;
        // XPath-like location of the current element within the XML stream
        String path_ = "";

        // Lucene document currently being populated
        Document current_doc_ = null;
        // qName of the element that opened current_doc_
        String current_node_ = "";
        // gs2:docOID of the current document (also the key used by delete mode)
        String current_doc_oid_ = "";
        // qName of the innermost element marked index="1", or "" if none
        String indexable_current_node_ = "";
        // whether the current indexable field's text should be analyzed (tokenize != "0")
        boolean tokenize = true;
        // character data accumulated for the field currently being read
        String current_contents_ = "";

        // "add", "update" or "delete" -- set per call, overridden by gs2:mode attribute
        String mode_ = "";
        // identifier of the input being processed, used in progress messages
        protected String file_id_ = null;

        /** pass in true if want to create a new index, false if want to use the existing one */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                // turn off DTD validation -- the input is trusted build output
                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                SimpleFSDirectory index_dir_dir = new SimpleFSDirectory(new File(index_dir.getPath()));

                analyzer_ = new GS2Analyzer(); // uses built-in stop-word set

                // create==true wipes any existing index at this location
                writer_ = new IndexWriter(index_dir_dir, analyzer_, create, MaxFieldLength.UNLIMITED);

                // by default, will only index 10,000 words per document
                // Can throw out_of_memory errors
                writer_.setMaxFieldLength(Integer.MAX_VALUE);
                if (create) {
                    // NOTE(review): optimising a freshly created (empty) index looks
                    // redundant -- presumably harmless; confirm the intent
                    writer_.optimize();
                }
            }
            catch (Exception e) {
                // We need to know if creating/opening the index fails
                e.printStackTrace();
            }
        }
227
228 /** index one document */
229 public void index (String file_id, File file)
230 {
231 mode_ = "add";
232 file_id_ = file_id;
233 path_ = "";
234 String base_path = file.getPath();
235 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
236
237 try {
238 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
239 }
240 catch (Exception e) {
241 println("parse error:");
242 e.printStackTrace();
243 }
244 }
245
246 /** index one document stored as string*/
247 public void index (String xml_text)
248 {
249 mode_ = "add";
250 file_id_ = "<xml doc on stdin>";
251 path_ = "";
252
253 try {
254 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
255 }
256 catch (Exception e) {
257 println("parse error:");
258 e.printStackTrace();
259 }
260 }
261
262 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */
263 public void delete(String xml_text)
264 {
265 mode_ = "delete";
266 file_id_ = "<delete doc>";
267 path_ = "";
268
269 try {
270 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
271 }
272 catch (Exception e) {
273 println("parse error:");
274 e.printStackTrace();
275 }
276 }
277
278 public void finish()
279 {
280 /** optimise the index */
281 try {
282 writer_.optimize();
283 writer_.close();
284 }
285 catch (Exception e) {
286 }
287 }
288
        /** Write s to stderr with no trailing newline. */
        protected void print(String s)
        {
            System.err.print(s);
        }

        /** Write s to stderr followed by a newline. */
        protected void println(String s)
        {
            System.err.println(s);
        }
298
        /** SAX callback: announce the start of processing for file_id_.
         *  The "[" opens a progress bracket that endDocument() closes. */
        public void startDocument() throws SAXException
        {
            println("Starting to process " + file_id_);
            print("[");
        }

        /** SAX callback: close the progress bracket and report completion. */
        public void endDocument() throws SAXException
        {
            println("]");
            println("... processing finished.");
        }
310
        /** SAX callback: start of an element.
         *  Opens a new Lucene document when the element is at doc_tag_level_,
         *  and records whether this element's text is to be indexed and
         *  whether it should be tokenized. */
        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                // per-document gs2:mode overrides the mode set by index()/delete()
                mode_ = atts.getValue("gs2:mode");

                pushOnStack(); // start new doc
                current_node_ = qName;

                //String node_id = atts.getValue("gs2:id");
                //print(" " + qName + ": " + node_id + " (" + mode_ + ")" );
                //current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                current_doc_oid_ = atts.getValue("gs2:docOID");
                print(" " + qName + ": " + current_doc_oid_ + " (" + mode_ + ")" );
                // docOID is stored un-analyzed: it is the key used later by
                // updateDocument() and deleteDocuments()
                current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.NOT_ANALYZED));
            }

            if (isIndexable(atts)) {
                indexable_current_node_ = qName;
                if (isTokenized(atts)) {
                    tokenize = true;
                } else {
                    tokenize = false;
                }
            }
            else {
                // not an index="1" element; endElement() will add no field for it
                indexable_current_node_ = "";
            }
        }
343
344 public static boolean isTokenized(Attributes atts) {
345 boolean tokenize = true;
346 String tok = atts.getValue("tokenize");
347 if (tok!=null && tok.equals("0")) {
348 tokenize = false;
349 }
350 return tokenize;
351 }
352
353 public static boolean isIndexable(Attributes atts)
354 {
355 boolean is_indexable = false;
356
357 String index = atts.getValue("index");
358 if (index!=null) {
359 if (index.equals("1")) {
360 is_indexable = true;
361 }
362 }
363 return is_indexable;
364 }
365
        /** SAX callback: end of an element.
         *  In delete mode, removes the document keyed by current_doc_oid_.
         *  In add/update mode, flushes accumulated text into current_doc_ as
         *  a field, and at the doc_tag_level_ element writes the document to
         *  the index.
         *
         *  NOTE(review): in delete mode deleteDocument() fires for EVERY
         *  closing tag, and path_ is only popped inside the add/update
         *  branch -- presumably acceptable for the simple Delete input
         *  produced by lucenebuildproc.pm, but worth confirming. */
        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (mode_.equals("delete")) {
                try {
                    deleteDocument(current_doc_oid_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
            }
            else if (mode_.equals("add") || mode_.equals("update")) {
                if (qName.equals(indexable_current_node_))
                {
                    if (tokenize) {
                        // tokenized fields keep term vectors for ranked searching
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
                    } else {
                        // untokenized (e.g. byXX sort fields) are stored as a single term
                        current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    }
                    // // The byXX fields are used for sorting search results
                    // // We don't want to do that for Text or AllFields fields
                    // // They need to be untokenised for sorting
                    // if (!qName.equals("TX") && !qName.equals("ZZ"))
                    // {
                    //     current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.NO));
                    // }

                    current_contents_ = "";
                }

                if (qName.equals(doc_tag_level_)) {
                    try {
                        // perhaps this is more efficient if addDocument()
                        // used for "add" and updateDocument() for "update"
                        writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
                    }
                    catch (java.io.IOException e) {
                        e.printStackTrace();
                    }
                    popOffStack(); // end document
                }

                path_ = removePathLink(path_);
            }
        }
410
411 public void characters(char ch[], int start, int length) throws SAXException
412 {
413 String data = new String(ch, start, length).trim();
414 if (data.length() > 0 ) {
415 current_contents_ += data;
416 }
417 }
418
419 protected String appendPathLink(String path, String qName, Attributes atts)
420 {
421
422 path = path + "/"+qName;
423 if (atts.getLength()>0) {
424 // was gs2:id, changed to gs2:docOID --kjdon
425 String id = atts.getValue("gs2:docOID");
426 if (id != null) {
427 path += "[@gs2:docOID='"+id+"']";
428 }
429 else {
430 // is this ever used? not in perl currently
431 id = atts.getValue("gs3:id");
432 if (id != null) {
433 path += "[@gs3:id='"+id+"']";
434 }
435 }
436 }
437 return path;
438 }
439
440 protected String removePathLink(String path)
441 {
442
443 int i=path.lastIndexOf('/');
444 if (i==-1) {
445 path="";
446 } else {
447 path = path.substring(0, i);
448 }
449 return path;
450 }
451
452
        /** these are what we save on the stack: a snapshot of the
         *  per-document parsing state, pushed when a nested document
         *  element begins and restored when it ends */
        private class MyDocument
        {
            // Lucene document under construction
            public Document doc = null;
            // field text accumulated so far
            public String contents = null;
            // qName of the element that opened this document
            public String tagname = "";

        }
461
462
463 protected void pushOnStack()
464 {
465 if (current_doc_ != null) {
466 MyDocument save = new MyDocument();
467 save.doc = current_doc_;
468 save.contents = current_contents_;
469 save.tagname = current_node_;
470 stack_.push(save);
471 }
472 current_doc_ = new Document();
473 current_contents_ = "";
474 current_node_ = "";
475 }
476
477 protected void popOffStack()
478 {
479 if (!stack_.empty()) {
480 MyDocument saved = (MyDocument)stack_.pop();
481 current_doc_ = saved.doc;
482 current_contents_ = saved.contents;
483 current_node_ = saved.tagname;
484 } else {
485 current_doc_ = new Document();
486 current_contents_ = "";
487 current_node_ = "";
488 }
489 }
490
491
492 protected void deleteDocument(String doc_id)
493 throws IOException
494 {
495 debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")");
496 debug("- Initial number of documents in index: " + writer_.numDocs());
497 writer_.deleteDocuments(new Term("docOID", doc_id));
498 debug("- Final number of documents in index: " + writer_.numDocs());
499 }
500
501
502 }
503}
Note: See TracBrowser for help on using the repository browser.