source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/Indexer.java@ 12255

Last change on this file since 12255 was 12255, checked in by mdewsnip, 18 years ago

Upgraded the version of Lucene from 1.4.1 to 2.0.0... what's the worst that could happen?

  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1
2import org.xml.sax.Attributes;
3import org.xml.sax.helpers.DefaultHandler;
4import org.xml.sax.InputSource;
5import org.xml.sax.SAXException;
6import org.xml.sax.XMLReader;
7
8import javax.xml.parsers.SAXParser;
9import javax.xml.parsers.SAXParserFactory;
10
11import org.apache.lucene.document.Document;
12import org.apache.lucene.document.Field;
13import org.apache.lucene.index.IndexWriter;
14import org.apache.lucene.analysis.standard.StandardAnalyzer;
15
16import java.util.Stack;
17import java.io.FileInputStream;
18import java.io.File;
19import java.io.StringReader;
20import java.net.URL;
21
22
23
24public class Indexer extends DefaultHandler
25{
26 IndexWriter writer_ = null;
27 SAXParser sax_parser_ = null;
28 String doc_tag_level_ = null;
29
30 Stack stack_ = null;
31 String path_ = "";
32
33 Document current_doc_ = null;
34 String current_node_ = "";
35 String indexable_current_node_ = "";
36 String current_contents_ = "";
37
38 protected String file_id_ = null;
39
40 /** pass in true if want to create a new index, false if want to use the existing one */
41 public Indexer (String doc_tag_level, File index_dir, boolean create)
42 {
43 doc_tag_level_ = doc_tag_level;
44
45 try {
46 stack_ = new Stack();
47 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
48 sax_parser_ = sax_factory.newSAXParser();
49
50 XMLReader reader = sax_parser_.getXMLReader();
51 reader.setFeature("http://xml.org/sax/features/validation", false);
52
53 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
54 // by default, will only index 10,000 words per document
55 // Can throw out_of_memory errors
56 writer_.setMaxFieldLength(Integer.MAX_VALUE);
57 if (create) {
58 writer_.optimize();
59 }
60
61 } catch (Exception e) {
62 // do nothing!
63 }
64 }
65
66 /** index one document */
67 public void index (String file_id, File file)
68 {
69 file_id_ = file_id;
70 path_ = "";
71 String base_path = file.getPath();
72 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
73
74 try {
75 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
76 }
77 catch (Exception e) {
78 println("parse error:");
79 e.printStackTrace();
80 }
81 }
82
83 /** index one document stored as string*/
84 public void index (String xml_text)
85 {
86 file_id_ = "<xml doc on stdin>";
87 path_ = "";
88
89 try {
90 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
91 }
92 catch (Exception e) {
93 println("parse error:");
94 e.printStackTrace();
95 }
96 }
97
98 public void finish()
99 {
100 /** optimise the index */
101 try {
102 writer_.optimize();
103 writer_.close();
104 }
105 catch (Exception e) {
106 }
107 }
108
109 protected void print(String s)
110 {
111 System.out.print(s);
112 }
113
114 protected void println(String s)
115 {
116 System.out.println(s);
117 }
118
119 public void startDocument() throws SAXException
120 {
121 println("Starting to index " + file_id_);
122 print("[");
123 }
124
125 public void endDocument() throws SAXException
126 {
127 println("]");
128 println("... indexing finished.");
129 }
130
131 public void startElement(String uri, String localName, String qName, Attributes atts)
132 throws SAXException
133 {
134 path_ = appendPathLink(path_, qName, atts);
135
136 if (qName.equals(doc_tag_level_)) {
137 pushOnStack(); // start new doc
138 current_node_ = qName;
139 String node_id = atts.getValue("gs2:id");
140
141 print(" " + qName + ": " + node_id );
142 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NO));
143 }
144
145 if (XMLTagInfo.isIndexable(atts)) {
146 indexable_current_node_ = qName;
147 }
148 else {
149 indexable_current_node_ = "";
150 }
151
152 }
153 public void endElement(String uri, String localName, String qName) throws SAXException
154 {
155 if (qName.equals(indexable_current_node_)) {
156 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED));
157 current_contents_ = "";
158 }
159
160 if (qName.equals(doc_tag_level_)) {
161 try {
162 writer_.addDocument(current_doc_);
163 }
164 catch (java.io.IOException e) {
165 e.printStackTrace();
166 }
167 popOffStack(); // end document
168 }
169
170 path_ = removePathLink(path_);
171 }
172
173 public void characters(char ch[], int start, int length) throws SAXException
174 {
175 String data = new String(ch, start, length).trim();
176 if (data.length() > 0 ) {
177 current_contents_ += data;
178 }
179 }
180
181 protected String appendPathLink(String path, String qName, Attributes atts)
182 {
183
184 path = path + "/"+qName;
185 if (atts.getLength()>0) {
186 String id = atts.getValue("gs2:id");
187 if (id != null) {
188 path += "[@gs2:id='"+id+"']";
189 }
190 else {
191 id = atts.getValue("gs3:id");
192 if (id != null) {
193 path += "[@gs3:id='"+id+"']";
194 }
195 }
196 }
197 return path;
198 }
199 protected String removePathLink(String path)
200 {
201
202 int i=path.lastIndexOf('/');
203 if (i==-1) {
204 path="";
205 } else {
206 path = path.substring(0, i);
207 }
208 return path;
209 }
210 /** these are what we save on the stack */
211 private class MyDocument
212 {
213 public Document doc = null;
214 public String contents = null;
215 public String tagname = "";
216
217 }
218
219 protected void pushOnStack()
220 {
221 if (current_doc_ != null) {
222 MyDocument save = new MyDocument();
223 save.doc = current_doc_;
224 save.contents = current_contents_;
225 save.tagname = current_node_;
226 stack_.push(save);
227 }
228 current_doc_ = new Document();
229 current_contents_ = "";
230 current_node_ = "";
231 }
232
233 protected void popOffStack()
234 {
235 if (!stack_.empty()) {
236 MyDocument saved = (MyDocument)stack_.pop();
237 current_doc_ = saved.doc;
238 current_contents_ = saved.contents;
239 current_node_ = saved.tagname;
240 } else {
241 current_doc_ = new Document();
242 current_contents_ = "";
243 current_node_ = "";
244 }
245 }
246
247
248}
249
250
Note: See TracBrowser for help on using the repository browser.