source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 12257

Last change on this file since 12257 was 12257, checked in by mdewsnip, 18 years ago

Added package definitions.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.8 KB
Line 
1package org.nzdl.gsdl.LuceneWrap;
2
3
4import org.xml.sax.Attributes;
5import org.xml.sax.helpers.DefaultHandler;
6import org.xml.sax.InputSource;
7import org.xml.sax.SAXException;
8import org.xml.sax.XMLReader;
9
10import javax.xml.parsers.SAXParser;
11import javax.xml.parsers.SAXParserFactory;
12
13import org.apache.lucene.document.Document;
14import org.apache.lucene.document.Field;
15import org.apache.lucene.index.IndexWriter;
16import org.apache.lucene.analysis.standard.StandardAnalyzer;
17
18import java.util.Stack;
19import java.io.FileInputStream;
20import java.io.File;
21import java.io.StringReader;
22import java.net.URL;
23
24
25
26public class Indexer extends DefaultHandler
27{
28 IndexWriter writer_ = null;
29 SAXParser sax_parser_ = null;
30 String doc_tag_level_ = null;
31
32 Stack stack_ = null;
33 String path_ = "";
34
35 Document current_doc_ = null;
36 String current_node_ = "";
37 String indexable_current_node_ = "";
38 String current_contents_ = "";
39
40 protected String file_id_ = null;
41
42 /** pass in true if want to create a new index, false if want to use the existing one */
43 public Indexer (String doc_tag_level, File index_dir, boolean create)
44 {
45 doc_tag_level_ = doc_tag_level;
46
47 try {
48 stack_ = new Stack();
49 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
50 sax_parser_ = sax_factory.newSAXParser();
51
52 XMLReader reader = sax_parser_.getXMLReader();
53 reader.setFeature("http://xml.org/sax/features/validation", false);
54
55 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
56 // by default, will only index 10,000 words per document
57 // Can throw out_of_memory errors
58 writer_.setMaxFieldLength(Integer.MAX_VALUE);
59 if (create) {
60 writer_.optimize();
61 }
62
63 } catch (Exception e) {
64 // do nothing!
65 }
66 }
67
68 /** index one document */
69 public void index (String file_id, File file)
70 {
71 file_id_ = file_id;
72 path_ = "";
73 String base_path = file.getPath();
74 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
75
76 try {
77 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
78 }
79 catch (Exception e) {
80 println("parse error:");
81 e.printStackTrace();
82 }
83 }
84
85 /** index one document stored as string*/
86 public void index (String xml_text)
87 {
88 file_id_ = "<xml doc on stdin>";
89 path_ = "";
90
91 try {
92 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
93 }
94 catch (Exception e) {
95 println("parse error:");
96 e.printStackTrace();
97 }
98 }
99
100 public void finish()
101 {
102 /** optimise the index */
103 try {
104 writer_.optimize();
105 writer_.close();
106 }
107 catch (Exception e) {
108 }
109 }
110
111 protected void print(String s)
112 {
113 System.out.print(s);
114 }
115
116 protected void println(String s)
117 {
118 System.out.println(s);
119 }
120
121 public void startDocument() throws SAXException
122 {
123 println("Starting to index " + file_id_);
124 print("[");
125 }
126
127 public void endDocument() throws SAXException
128 {
129 println("]");
130 println("... indexing finished.");
131 }
132
133 public void startElement(String uri, String localName, String qName, Attributes atts)
134 throws SAXException
135 {
136 path_ = appendPathLink(path_, qName, atts);
137
138 if (qName.equals(doc_tag_level_)) {
139 pushOnStack(); // start new doc
140 current_node_ = qName;
141 String node_id = atts.getValue("gs2:id");
142
143 print(" " + qName + ": " + node_id );
144 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NO));
145 }
146
147 if (XMLTagInfo.isIndexable(atts)) {
148 indexable_current_node_ = qName;
149 }
150 else {
151 indexable_current_node_ = "";
152 }
153
154 }
155 public void endElement(String uri, String localName, String qName) throws SAXException
156 {
157 if (qName.equals(indexable_current_node_)) {
158 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED));
159 current_contents_ = "";
160 }
161
162 if (qName.equals(doc_tag_level_)) {
163 try {
164 writer_.addDocument(current_doc_);
165 }
166 catch (java.io.IOException e) {
167 e.printStackTrace();
168 }
169 popOffStack(); // end document
170 }
171
172 path_ = removePathLink(path_);
173 }
174
175 public void characters(char ch[], int start, int length) throws SAXException
176 {
177 String data = new String(ch, start, length).trim();
178 if (data.length() > 0 ) {
179 current_contents_ += data;
180 }
181 }
182
183 protected String appendPathLink(String path, String qName, Attributes atts)
184 {
185
186 path = path + "/"+qName;
187 if (atts.getLength()>0) {
188 String id = atts.getValue("gs2:id");
189 if (id != null) {
190 path += "[@gs2:id='"+id+"']";
191 }
192 else {
193 id = atts.getValue("gs3:id");
194 if (id != null) {
195 path += "[@gs3:id='"+id+"']";
196 }
197 }
198 }
199 return path;
200 }
201 protected String removePathLink(String path)
202 {
203
204 int i=path.lastIndexOf('/');
205 if (i==-1) {
206 path="";
207 } else {
208 path = path.substring(0, i);
209 }
210 return path;
211 }
212 /** these are what we save on the stack */
213 private class MyDocument
214 {
215 public Document doc = null;
216 public String contents = null;
217 public String tagname = "";
218
219 }
220
221 protected void pushOnStack()
222 {
223 if (current_doc_ != null) {
224 MyDocument save = new MyDocument();
225 save.doc = current_doc_;
226 save.contents = current_contents_;
227 save.tagname = current_node_;
228 stack_.push(save);
229 }
230 current_doc_ = new Document();
231 current_contents_ = "";
232 current_node_ = "";
233 }
234
235 protected void popOffStack()
236 {
237 if (!stack_.empty()) {
238 MyDocument saved = (MyDocument)stack_.pop();
239 current_doc_ = saved.doc;
240 current_contents_ = saved.contents;
241 current_node_ = saved.tagname;
242 } else {
243 current_doc_ = new Document();
244 current_contents_ = "";
245 current_node_ = "";
246 }
247 }
248
249
250}
251
252
Note: See TracBrowser for help on using the repository browser.