Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java@ 18132

Last change on this file since 18132 was 18132, checked in by kjdon, 15 years ago
we don't want to store the ZZ field again for sorting
Property svn:keywords set to `Author Date Id Revision`
File size: 10.1 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneIndexer.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26
27	package org.greenstone.LuceneWrapper;
28
29
30	import java.io.*;
31	import java.util.Vector;
32
33	import org.xml.sax.Attributes;
34	import org.xml.sax.helpers.DefaultHandler;
35	import org.xml.sax.InputSource;
36	import org.xml.sax.SAXException;
37	import org.xml.sax.XMLReader;
38
39	import javax.xml.parsers.SAXParser;
40	import javax.xml.parsers.SAXParserFactory;
41
42	import org.apache.lucene.document.Document;
43	import org.apache.lucene.document.Field;
44	import org.apache.lucene.index.IndexWriter;
45	import org.apache.lucene.index.Term;
46	import org.apache.lucene.analysis.Analyzer;
47
48	import java.util.Stack;
49	import java.io.FileInputStream;
50	import java.io.File;
51	import java.io.StringReader;
52	import java.net.URL;
53
54
55	/**
56	* class for indexing XML generated by lucenebuildproc.pm
57	*/
58
59	public class GS2LuceneIndexer {
60
61	public static void main (String args[]) throws Exception
62	{
63	int verbosity = 1;
64	// Default is to edit the existing index
65	boolean create_new_index = false;
66
67	Vector filtered_args = new Vector();
68
69	int argc = args.length;
70	int i = 0;
71	while (i<argc) {
72	if (args[i].startsWith("-")) {
73
74	// -removeold causes the existing index to be overwritten
75	if (args[i].equals("-removeold")) {
76	create_new_index = true;
77	}
78
79	// -verbosity [num]
80	else if (args[i].equals("-verbosity")) {
81	i++;
82	if (i<argc) {
83	verbosity = Integer.parseInt(args[i]);
84	}
85	}
86	else {
87	System.out.println("Unrecognised option: " + args[i]);
88	}
89	}
90	else {
91	filtered_args.add((Object)args[i]);
92	}
93	i++;
94	}
95
96	if (filtered_args.size() != 3) {
97	System.out.println("Usage: java GS2LuceneIndexer [-removeold\|-verbosity [num]] doc-tag-level building_dir index");
98	return;
99	}
100
101	String doc_tag_level = (String)filtered_args.get(0);
102	String building_dirname = (String)filtered_args.get(1);
103	String index_dirname = (String)filtered_args.get(2);
104
105	String import_dirname = building_dirname + File.separator + "text";
106
107	File import_dir = new File(import_dirname);
108	File building_dir = new File(building_dirname);
109
110	if (!import_dir.exists()) {
111	System.out.println("Couldn't find import directory: "+import_dirname);
112	return;
113	}
114
115	File idx_dir = new File(building_dir.getPath()+File.separator+index_dirname+File.separator);
116	idx_dir.mkdir();
117
118	// Set up indexer
119	Indexer indexer = new Indexer(doc_tag_level, idx_dir, create_new_index);
120
121	// Read from stdin the files to process
122	try {
123	InputStreamReader isr = new InputStreamReader(System.in, "UTF-8");
124	BufferedReader brin = new BufferedReader(isr);
125
126	StringBuffer xml_text = new StringBuffer(1024);
127	String line = null;
128	while ((line = brin.readLine()) != null) {
129	xml_text.append(line);
130	if (line.startsWith("</Doc>")) {
131	indexer.index(xml_text.toString());
132	xml_text = new StringBuffer(1024);
133	}
134	}
135
136	brin.close();
137	isr.close();
138
139	} catch (IOException e) {
140	System.err.println("Error: unable to read from stdin");
141	e.printStackTrace();
142	}
143
144	indexer.finish();
145	}
146
147
148	static public class Indexer extends DefaultHandler
149	{
150	IndexWriter writer_ = null;
151	Analyzer analyzer_ = null;
152	SAXParser sax_parser_ = null;
153	String doc_tag_level_ = null;
154
155	Stack stack_ = null;
156	String path_ = "";
157
158	Document current_doc_ = null;
159	String current_node_ = "";
160	String current_doc_oid_ = "";
161	String indexable_current_node_ = "";
162	String current_contents_ = "";
163
164	protected String file_id_ = null;
165
166	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
167
168	/** pass in true if want to create a new index, false if want to use the existing one */
169	public Indexer (String doc_tag_level, File index_dir, boolean create)
170	{
171	doc_tag_level_ = doc_tag_level;
172
173	try {
174	stack_ = new Stack();
175	SAXParserFactory sax_factory = SAXParserFactory.newInstance();
176	sax_parser_ = sax_factory.newSAXParser();
177
178	XMLReader reader = sax_parser_.getXMLReader();
179	reader.setFeature("http://xml.org/sax/features/validation", false);
180
181	analyzer_ = new GS2Analyzer(stop_words);
182
183	writer_ = new IndexWriter(index_dir.getPath(), analyzer_, create);
184	// by default, will only index 10,000 words per document
185	// Can throw out_of_memory errors
186	writer_.setMaxFieldLength(Integer.MAX_VALUE);
187	if (create) {
188	writer_.optimize();
189	}
190	}
191	catch (Exception e) {
192	// We need to know if creating/opening the index fails
193	e.printStackTrace();
194	}
195	}
196
197	/** index one document */
198	public void index (String file_id, File file)
199	{
200	file_id_ = file_id;
201	path_ = "";
202	String base_path = file.getPath();
203	base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
204
205	try {
206	sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
207	}
208	catch (Exception e) {
209	println("parse error:");
210	e.printStackTrace();
211	}
212	}
213
214	/** index one document stored as string*/
215	public void index (String xml_text)
216	{
217	file_id_ = "<xml doc on stdin>";
218	path_ = "";
219
220	try {
221	sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
222	}
223	catch (Exception e) {
224	println("parse error:");
225	e.printStackTrace();
226	}
227	}
228
229	public void finish()
230	{
231	/** optimise the index */
232	try {
233	writer_.optimize();
234	writer_.close();
235	}
236	catch (Exception e) {
237	}
238	}
239
240	protected void print(String s)
241	{
242	System.out.print(s);
243	}
244
245	protected void println(String s)
246	{
247	System.out.println(s);
248	}
249
250	public void startDocument() throws SAXException
251	{
252	println("Starting to index " + file_id_);
253	print("[");
254	}
255
256	public void endDocument() throws SAXException
257	{
258	println("]");
259	println("... indexing finished.");
260	}
261
262	public void startElement(String uri, String localName, String qName, Attributes atts)
263	throws SAXException
264	{
265	path_ = appendPathLink(path_, qName, atts);
266
267	if (qName.equals(doc_tag_level_)) {
268	pushOnStack(); // start new doc
269	current_node_ = qName;
270
271	String node_id = atts.getValue("gs2:id");
272	print(" " + qName + ": " + node_id );
273	current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
274
275	current_doc_oid_ = atts.getValue("gs2:docOID");
276	current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED));
277	}
278
279	if (isIndexable(atts)) {
280	indexable_current_node_ = qName;
281	}
282	else {
283	indexable_current_node_ = "";
284	}
285	}
286
287	public static boolean isIndexable(Attributes atts)
288	{
289	boolean is_indexable = false;
290
291	String index = atts.getValue("index");
292	if (index!=null) {
293	if (index.equals("1")) {
294	is_indexable = true;
295	}
296	}
297	return is_indexable;
298	}
299
300	public void endElement(String uri, String localName, String qName) throws SAXException
301	{
302	if (qName.equals(indexable_current_node_))
303	{
304	current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
305	// The byXX fields are used for sorting search results
306	// We don't want to do that for Text or AllFields fields
307	// They need to be untokenised for sorting
308	if (!qName.equals("TX") && !qName.equals("ZZ"))
309	{
310	current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
311	}
312
313	current_contents_ = "";
314	}
315
316	if (qName.equals(doc_tag_level_)) {
317	try {
318	writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_);
319	}
320	catch (java.io.IOException e) {
321	e.printStackTrace();
322	}
323	popOffStack(); // end document
324	}
325
326	path_ = removePathLink(path_);
327	}
328
329	public void characters(char ch[], int start, int length) throws SAXException
330	{
331	String data = new String(ch, start, length).trim();
332	if (data.length() > 0 ) {
333	current_contents_ += data;
334	}
335	}
336
337	protected String appendPathLink(String path, String qName, Attributes atts)
338	{
339
340	path = path + "/"+qName;
341	if (atts.getLength()>0) {
342	String id = atts.getValue("gs2:id");
343	if (id != null) {
344	path += "[@gs2:id='"+id+"']";
345	}
346	else {
347	id = atts.getValue("gs3:id");
348	if (id != null) {
349	path += "[@gs3:id='"+id+"']";
350	}
351	}
352	}
353	return path;
354	}
355
356	protected String removePathLink(String path)
357	{
358
359	int i=path.lastIndexOf('/');
360	if (i==-1) {
361	path="";
362	} else {
363	path = path.substring(0, i);
364	}
365	return path;
366	}
367
368
369	/** these are what we save on the stack */
370	private class MyDocument
371	{
372	public Document doc = null;
373	public String contents = null;
374	public String tagname = "";
375
376	}
377
378
379	protected void pushOnStack()
380	{
381	if (current_doc_ != null) {
382	MyDocument save = new MyDocument();
383	save.doc = current_doc_;
384	save.contents = current_contents_;
385	save.tagname = current_node_;
386	stack_.push(save);
387	}
388	current_doc_ = new Document();
389	current_contents_ = "";
390	current_node_ = "";
391	}
392
393	protected void popOffStack()
394	{
395	if (!stack_.empty()) {
396	MyDocument saved = (MyDocument)stack_.pop();
397	current_doc_ = saved.doc;
398	current_contents_ = saved.contents;
399	current_node_ = saved.tagname;
400	} else {
401	current_doc_ = new Document();
402	current_contents_ = "";
403	current_node_ = "";
404	}
405	}
406	}
407	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: