source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/Indexer.java@ 11245

Last change on this file since 11245 was 11245, checked in by kjdon, 18 years ago

By default, the Lucene indexer will only index the first 10,000 words of a document, to avoid out-of-memory errors. I have set the maximum document length to the maximum integer value. Hope this is OK.

  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1
2import org.xml.sax.Attributes;
3import org.xml.sax.helpers.DefaultHandler;
4import org.xml.sax.InputSource;
5import org.xml.sax.SAXException;
6import org.xml.sax.XMLReader;
7
8import javax.xml.parsers.SAXParser;
9import javax.xml.parsers.SAXParserFactory;
10
11import org.apache.lucene.document.Document;
12import org.apache.lucene.document.Field;
13import org.apache.lucene.index.IndexWriter;
14import org.apache.lucene.analysis.standard.StandardAnalyzer;
15
16import java.util.Stack;
17import java.io.FileInputStream;
18import java.io.File;
19import java.io.StringReader;
20import java.net.URL;
21
22
23
24public class Indexer extends DefaultHandler
25{
26 IndexWriter writer_ = null;
27 SAXParser sax_parser_ = null;
28 String doc_tag_level_ = null;
29
30 Stack stack_ = null;
31 String path_ = "";
32
33 Document current_doc_ = null;
34 String current_node_ = "";
35 String indexable_current_node_ = "";
36 String current_contents_ = "";
37
38 protected String file_id_ = null;
39
40 /** pass in true if want to create a new index, false if want to use the existing one */
41 public Indexer (String doc_tag_level, File index_dir, boolean create)
42 {
43 doc_tag_level_ = doc_tag_level;
44
45 try {
46 stack_ = new Stack();
47 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
48 sax_parser_ = sax_factory.newSAXParser();
49
50 XMLReader reader = sax_parser_.getXMLReader();
51 reader.setFeature("http://xml.org/sax/features/validation", false);
52
53 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
54 // by default, will only index 10,000 words per document
55 // Can throw out_of_memory errors
56 writer_.maxFieldLength = Integer.MAX_VALUE;
57 if (create) {
58 writer_.optimize();
59 }
60
61 } catch (Exception e) {
62 // do nothing!
63 }
64 }
65
66 /** index one document */
67 public void index (String file_id, File file)
68 {
69 file_id_ = file_id;
70 path_ = "";
71 String base_path = file.getPath();
72 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
73
74 try {
75 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
76 }
77 catch (Exception e) {
78 println("parse error:");
79 e.printStackTrace();
80 }
81 }
82
83 /** index one document stored as string*/
84 public void index (String xml_text)
85 {
86 file_id_ = "<xml doc on stdin>";
87 path_ = "";
88
89 try {
90 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
91 }
92 catch (Exception e) {
93 println("parse error:");
94 e.printStackTrace();
95 }
96 }
97
98 public void finish()
99 {
100 /** optimise the index */
101 try {
102 writer_.optimize();
103 writer_.close();
104 }
105 catch (Exception e) {
106 }
107 }
108
109 protected void print(String s)
110 {
111 System.out.print(s);
112 }
113
114 protected void println(String s)
115 {
116 System.out.println(s);
117 }
118
119 public void startDocument() throws SAXException
120 {
121 println("Starting to index " + file_id_);
122 print("[");
123 }
124
125 public void endDocument() throws SAXException
126 {
127 println("]");
128 println("... indexing finished.");
129 }
130
131 public void startElement(String uri, String localName, String qName, Attributes atts)
132 throws SAXException
133 {
134 path_ = appendPathLink(path_, qName, atts);
135
136 if (qName.equals(doc_tag_level_)) {
137 pushOnStack(); // start new doc
138 current_node_ = qName;
139 String node_id = atts.getValue("gs2:id");
140
141 print(" " + qName + ": " + node_id );
142 current_doc_.add(Field.UnIndexed("nodeID", node_id));
143 }
144
145 if (XMLTagInfo.isIndexable(atts)) {
146 indexable_current_node_ = qName;
147 }
148 else {
149 indexable_current_node_ = "";
150 }
151
152 }
153 public void endElement(String uri, String localName, String qName) throws SAXException
154 {
155 if (qName.equals(indexable_current_node_)) {
156 current_doc_.add(Field.UnStored(qName, current_contents_));
157 current_contents_ = "";
158 }
159
160 if (qName.equals(doc_tag_level_)) {
161 try {
162 writer_.addDocument(current_doc_);
163 }
164 catch (java.io.IOException e) {
165 e.printStackTrace();
166 }
167 popOffStack(); // end document
168 }
169
170 path_ = removePathLink(path_);
171 }
172
173 public void characters(char ch[], int start, int length) throws SAXException
174 {
175 String data = new String(ch, start, length).trim();
176 if (data.length() > 0 ) {
177 current_contents_ += data;
178 }
179 }
180
181 protected String appendPathLink(String path, String qName, Attributes atts)
182 {
183
184 path = path + "/"+qName;
185 if (atts.getLength()>0) {
186 String id = atts.getValue("gs2:id");
187 if (id != null) {
188 path += "[@gs2:id='"+id+"']";
189 }
190 else {
191 id = atts.getValue("gs3:id");
192 if (id != null) {
193 path += "[@gs3:id='"+id+"']";
194 }
195 }
196 }
197 return path;
198 }
199 protected String removePathLink(String path)
200 {
201
202 int i=path.lastIndexOf('/');
203 if (i==-1) {
204 path="";
205 } else {
206 path = path.substring(0, i);
207 }
208 return path;
209 }
210 /** these are what we save on the stack */
211 private class MyDocument
212 {
213 public Document doc = null;
214 public String contents = null;
215 public String tagname = "";
216
217 }
218
219 protected void pushOnStack()
220 {
221 if (current_doc_ != null) {
222 MyDocument save = new MyDocument();
223 save.doc = current_doc_;
224 save.contents = current_contents_;
225 save.tagname = current_node_;
226 stack_.push(save);
227 }
228 current_doc_ = new Document();
229 current_contents_ = "";
230 current_node_ = "";
231 }
232
233 protected void popOffStack()
234 {
235 if (!stack_.empty()) {
236 MyDocument saved = (MyDocument)stack_.pop();
237 current_doc_ = saved.doc;
238 current_contents_ = saved.contents;
239 current_node_ = saved.tagname;
240 } else {
241 current_doc_ = new Document();
242 current_contents_ = "";
243 current_node_ = "";
244 }
245 }
246
247
248}
249
250
Note: See TracBrowser for help on using the repository browser.