source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 10164

Last change on this file since 10164 was 10164, checked in by davidb, 19 years ago

Code upgraded to support incremental building. This mostly involves parsing
the -create flag in main and passing it down to lower-level functions (as
a boolean flag).

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.util.Stack;
import java.io.FileInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.net.URL;
21
22
23
24public class Indexer extends DefaultHandler
25{
26 IndexWriter writer_ = null;
27 SAXParser sax_parser_ = null;
28 String doc_tag_level_ = null;
29
30 Stack stack_ = null;
31 String path_ = "";
32
33 Document current_doc_ = null;
34 String current_node_ = "";
35 String indexable_current_node_ = "";
36 String current_contents_ = "";
37
38 protected String file_id_ = null;
39
40 /** pass in true if want to create a new index, false if want to use the existing one */
41 public Indexer (String doc_tag_level, File index_dir, boolean create)
42 {
43 doc_tag_level_ = doc_tag_level;
44
45 try {
46 stack_ = new Stack();
47 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
48 sax_parser_ = sax_factory.newSAXParser();
49
50 XMLReader reader = sax_parser_.getXMLReader();
51 reader.setFeature("http://xml.org/sax/features/validation", false);
52
53 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
54 if (create) {
55 writer_.optimize();
56 }
57
58 } catch (Exception e) {
59 // do nothing!
60 }
61 }
62
63 /** index one document */
64 public void index (String file_id, File file)
65 {
66 file_id_ = file_id;
67 path_ = "";
68 String base_path = file.getPath();
69 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
70
71 try {
72 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
73 }
74 catch (Exception e) {
75 println("parse error:");
76 e.printStackTrace();
77 }
78 }
79
80 /** index one document stored as string*/
81 public void index (String xml_text)
82 {
83 file_id_ = "<xml doc on stdin>";
84 path_ = "";
85
86 try {
87 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
88 }
89 catch (Exception e) {
90 println("parse error:");
91 e.printStackTrace();
92 }
93 }
94
95 public void finish()
96 {
97 /** optimise the index */
98 try {
99 writer_.optimize();
100 writer_.close();
101 }
102 catch (Exception e) {
103 }
104 }
105
106 protected void print(String s)
107 {
108 System.out.print(s);
109 }
110
111 protected void println(String s)
112 {
113 System.out.println(s);
114 }
115
116 public void startDocument() throws SAXException
117 {
118 println("Starting to index " + file_id_);
119 print("[");
120 }
121
122 public void endDocument() throws SAXException
123 {
124 println("]");
125 println("... indexing finished.");
126 }
127
128 public void startElement(String uri, String localName, String qName, Attributes atts)
129 throws SAXException
130 {
131 path_ = appendPathLink(path_, qName, atts);
132
133 if (qName.equals(doc_tag_level_)) {
134 pushOnStack(); // start new doc
135 current_node_ = qName;
136 String node_id = atts.getValue("gs2:id");
137
138 print(" " + qName + ": " + node_id );
139 current_doc_.add(Field.UnIndexed("nodeID", node_id));
140 }
141
142 if (XMLTagInfo.isIndexable(atts)) {
143 indexable_current_node_ = qName;
144 }
145 else {
146 indexable_current_node_ = "";
147 }
148
149 }
150 public void endElement(String uri, String localName, String qName) throws SAXException
151 {
152 if (qName.equals(indexable_current_node_)) {
153 current_doc_.add(Field.UnStored(qName, current_contents_));
154 current_contents_ = "";
155 }
156
157 if (qName.equals(doc_tag_level_)) {
158 try {
159 writer_.addDocument(current_doc_);
160 }
161 catch (java.io.IOException e) {
162 e.printStackTrace();
163 }
164 popOffStack(); // end document
165 }
166
167 path_ = removePathLink(path_);
168 }
169
170 public void characters(char ch[], int start, int length) throws SAXException
171 {
172 String data = new String(ch, start, length).trim();
173 if (data.length() > 0 ) {
174 current_contents_ += data;
175 }
176 }
177
178 protected String appendPathLink(String path, String qName, Attributes atts)
179 {
180
181 path = path + "/"+qName;
182 if (atts.getLength()>0) {
183 String id = atts.getValue("gs2:id");
184 if (id != null) {
185 path += "[@gs2:id='"+id+"']";
186 }
187 else {
188 id = atts.getValue("gs3:id");
189 if (id != null) {
190 path += "[@gs3:id='"+id+"']";
191 }
192 }
193 }
194 return path;
195 }
196 protected String removePathLink(String path)
197 {
198
199 int i=path.lastIndexOf('/');
200 if (i==-1) {
201 path="";
202 } else {
203 path = path.substring(0, i);
204 }
205 return path;
206 }
207 /** these are what we save on the stack */
208 private class MyDocument
209 {
210 public Document doc = null;
211 public String contents = null;
212 public String tagname = "";
213
214 }
215
216 protected void pushOnStack()
217 {
218 if (current_doc_ != null) {
219 MyDocument save = new MyDocument();
220 save.doc = current_doc_;
221 save.contents = current_contents_;
222 save.tagname = current_node_;
223 stack_.push(save);
224 }
225 current_doc_ = new Document();
226 current_contents_ = "";
227 current_node_ = "";
228 }
229
230 protected void popOffStack()
231 {
232 if (!stack_.empty()) {
233 MyDocument saved = (MyDocument)stack_.pop();
234 current_doc_ = saved.doc;
235 current_contents_ = saved.contents;
236 current_node_ = saved.tagname;
237 } else {
238 current_doc_ = new Document();
239 current_contents_ = "";
240 current_node_ = "";
241 }
242 }
243
244
245}
246
247
Note: See TracBrowser for help on using the repository browser.