source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 8521

Last change on this file since 8521 was 8521, checked in by davidb, 19 years ago

Java bridge from how Greenstone likes to do indexing and querying to Lucene's classes

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1
2import org.xml.sax.Attributes;
3import org.xml.sax.helpers.DefaultHandler;
4import org.xml.sax.InputSource;
5import org.xml.sax.SAXException;
6import org.xml.sax.XMLReader;
7
8import javax.xml.parsers.SAXParser;
9import javax.xml.parsers.SAXParserFactory;
10
11import org.apache.lucene.document.Document;
12import org.apache.lucene.document.Field;
13import org.apache.lucene.index.IndexWriter;
14import org.apache.lucene.analysis.standard.StandardAnalyzer;
15
16import java.util.Stack;
17import java.io.FileInputStream;
18import java.io.File;
19import java.io.StringReader;
20import java.net.URL;
21
22
23
24public class Indexer extends DefaultHandler
25{
26 IndexWriter writer_ = null;
27 SAXParser sax_parser_ = null;
28 String doc_tag_level_ = null;
29
30 Stack stack_ = null;
31 String path_ = "";
32
33 Document current_doc_ = null;
34 String current_node_ = "";
35 String indexable_current_node_ = "";
36 String current_contents_ = "";
37
38 protected String file_id_ = null;
39
40 /** pass in true if want to create a new index, false if want to use the existing one */
41 public Indexer (String doc_tag_level, File index_dir, boolean create)
42 {
43 doc_tag_level_ = doc_tag_level;
44
45 try {
46 stack_ = new Stack();
47 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
48 sax_parser_ = sax_factory.newSAXParser();
49
50 XMLReader reader = sax_parser_.getXMLReader();
51 reader.setFeature("http://xml.org/sax/features/validation", false);
52
53 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
54 if (create) {
55 writer_.optimize();
56 }
57
58 } catch (Exception e) {
59 // do nothing!
60 }
61 }
62
63 /** index one document */
64 public void index (String file_id, File file)
65 {
66 file_id_ = file_id;
67 path_ = "";
68 String base_path = file.getPath();
69 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
70
71 try {
72 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
73 }
74 catch (Exception e) {
75 println("parse error:");
76 e.printStackTrace();
77 }
78 }
79
80 /** index one document stored as string*/
81 public void index (String xml_text)
82 {
83 file_id_ = "<xml doc as string>";
84 path_ = "";
85
86 try {
87 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
88 }
89 catch (Exception e) {
90 println("parse error:");
91 e.printStackTrace();
92 }
93 }
94
95 public void finish()
96 {
97 /** optimise the index */
98 try {
99 writer_.optimize();
100 writer_.close();
101 }
102 catch (Exception e) {
103 }
104 }
105
106 protected void println(String s)
107 {
108 System.out.println(s);
109 }
110
111 public void startDocument() throws SAXException
112 {
113 println("Starting to index " + file_id_);
114 }
115
116 public void endDocument() throws SAXException
117 {
118 println("... indexing finished.");
119 }
120
121 public void startElement(String uri, String localName, String qName, Attributes atts)
122 throws SAXException
123 {
124 path_ = appendPathLink(path_, qName, atts);
125
126 if (qName.equals(doc_tag_level_)) {
127 pushOnStack(); // start new doc
128 current_node_ = qName;
129 String node_id = atts.getValue("gs2:id");
130
131 System.out.println("**** Indexing "+ qName + " " + node_id );
132 current_doc_.add(Field.UnIndexed("nodeID", node_id));
133 }
134
135 if (XMLTagInfo.isIndexable(atts)) {
136 indexable_current_node_ = qName;
137 }
138 else {
139 indexable_current_node_ = "";
140 }
141
142 }
143 public void endElement(String uri, String localName, String qName) throws SAXException
144 {
145 if (qName.equals(indexable_current_node_)) {
146 current_doc_.add(Field.UnStored(qName, current_contents_));
147 current_contents_ = "";
148 }
149
150 if (qName.equals(doc_tag_level_)) {
151 try {
152 writer_.addDocument(current_doc_);
153 }
154 catch (java.io.IOException e) {
155 e.printStackTrace();
156 }
157 popOffStack(); // end document
158 }
159
160 path_ = removePathLink(path_);
161 }
162
163 public void characters(char ch[], int start, int length) throws SAXException
164 {
165 String data = new String(ch, start, length).trim();
166 if (data.length() > 0 ) {
167 current_contents_ += data;
168 }
169 }
170
171 protected String appendPathLink(String path, String qName, Attributes atts)
172 {
173
174 path = path + "/"+qName;
175 if (atts.getLength()>0) {
176 String id = atts.getValue("gs2:id");
177 if (id != null) {
178 path += "[@gs2:id='"+id+"']";
179 }
180 else {
181 id = atts.getValue("gs3:id");
182 if (id != null) {
183 path += "[@gs3:id='"+id+"']";
184 }
185 }
186 }
187 return path;
188 }
189 protected String removePathLink(String path)
190 {
191
192 int i=path.lastIndexOf('/');
193 if (i==-1) {
194 path="";
195 } else {
196 path = path.substring(0, i);
197 }
198 return path;
199 }
200 /** these are what we save on the stack */
201 private class MyDocument
202 {
203 public Document doc = null;
204 public String contents = null;
205 public String tagname = "";
206
207 }
208
209 protected void pushOnStack()
210 {
211 if (current_doc_ != null) {
212 MyDocument save = new MyDocument();
213 save.doc = current_doc_;
214 save.contents = current_contents_;
215 save.tagname = current_node_;
216 stack_.push(save);
217 }
218 current_doc_ = new Document();
219 current_contents_ = "";
220 current_node_ = "";
221 }
222
223 protected void popOffStack()
224 {
225 if (!stack_.empty()) {
226 MyDocument saved = (MyDocument)stack_.pop();
227 current_doc_ = saved.doc;
228 current_contents_ = saved.contents;
229 current_node_ = saved.tagname;
230 } else {
231 current_doc_ = new Document();
232 current_contents_ = "";
233 current_node_ = "";
234 }
235 }
236
237
238}
239
240
Note: See TracBrowser for help on using the repository browser.