source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/Indexer.java@ 12376

Last change on this file since 12376 was 12376, checked in by mdewsnip, 18 years ago

Now stores term vectors (for the TX field only), to support query term occurrences. Many thanks to John Thompson and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.1 KB
Line 
1package org.nzdl.gsdl.LuceneWrap;
2
3
4import org.xml.sax.Attributes;
5import org.xml.sax.helpers.DefaultHandler;
6import org.xml.sax.InputSource;
7import org.xml.sax.SAXException;
8import org.xml.sax.XMLReader;
9
10import javax.xml.parsers.SAXParser;
11import javax.xml.parsers.SAXParserFactory;
12
13import org.apache.lucene.document.Document;
14import org.apache.lucene.document.Field;
15import org.apache.lucene.index.IndexWriter;
16import org.apache.lucene.analysis.standard.StandardAnalyzer;
17
18import java.util.Stack;
19import java.io.FileInputStream;
20import java.io.File;
21import java.io.StringReader;
22import java.net.URL;
23
24
25
26public class Indexer extends DefaultHandler
27{
28 IndexWriter writer_ = null;
29 SAXParser sax_parser_ = null;
30 String doc_tag_level_ = null;
31
32 Stack stack_ = null;
33 String path_ = "";
34
35 Document current_doc_ = null;
36 String current_node_ = "";
37 String indexable_current_node_ = "";
38 String current_contents_ = "";
39
40 protected String file_id_ = null;
41
42 /** pass in true if want to create a new index, false if want to use the existing one */
43 public Indexer (String doc_tag_level, File index_dir, boolean create)
44 {
45 doc_tag_level_ = doc_tag_level;
46
47 try {
48 stack_ = new Stack();
49 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
50 sax_parser_ = sax_factory.newSAXParser();
51
52 XMLReader reader = sax_parser_.getXMLReader();
53 reader.setFeature("http://xml.org/sax/features/validation", false);
54
55 writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
56 // by default, will only index 10,000 words per document
57 // Can throw out_of_memory errors
58 writer_.setMaxFieldLength(Integer.MAX_VALUE);
59 if (create) {
60 writer_.optimize();
61 }
62
63 } catch (Exception e) {
64 // do nothing!
65 }
66 }
67
68 /** index one document */
69 public void index (String file_id, File file)
70 {
71 file_id_ = file_id;
72 path_ = "";
73 String base_path = file.getPath();
74 base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
75
76 try {
77 sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
78 }
79 catch (Exception e) {
80 println("parse error:");
81 e.printStackTrace();
82 }
83 }
84
85 /** index one document stored as string*/
86 public void index (String xml_text)
87 {
88 file_id_ = "<xml doc on stdin>";
89 path_ = "";
90
91 try {
92 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
93 }
94 catch (Exception e) {
95 println("parse error:");
96 e.printStackTrace();
97 }
98 }
99
100 public void finish()
101 {
102 /** optimise the index */
103 try {
104 writer_.optimize();
105 writer_.close();
106 }
107 catch (Exception e) {
108 }
109 }
110
111 protected void print(String s)
112 {
113 System.out.print(s);
114 }
115
116 protected void println(String s)
117 {
118 System.out.println(s);
119 }
120
121 public void startDocument() throws SAXException
122 {
123 println("Starting to index " + file_id_);
124 print("[");
125 }
126
127 public void endDocument() throws SAXException
128 {
129 println("]");
130 println("... indexing finished.");
131 }
132
133 public void startElement(String uri, String localName, String qName, Attributes atts)
134 throws SAXException
135 {
136 path_ = appendPathLink(path_, qName, atts);
137
138 if (qName.equals(doc_tag_level_)) {
139 pushOnStack(); // start new doc
140 current_node_ = qName;
141 String node_id = atts.getValue("gs2:id");
142
143 print(" " + qName + ": " + node_id );
144 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.NO));
145 }
146
147 if (XMLTagInfo.isIndexable(atts)) {
148 indexable_current_node_ = qName;
149 }
150 else {
151 indexable_current_node_ = "";
152 }
153
154 }
155 public void endElement(String uri, String localName, String qName) throws SAXException
156 {
157 if (qName.equals(indexable_current_node_))
158 {
159 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
160 // We only need the term vector for the TX field
161 if (!qName.equals("TX"))
162 {
163 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
164 }
165
166 current_contents_ = "";
167 }
168
169 if (qName.equals(doc_tag_level_)) {
170 try {
171 writer_.addDocument(current_doc_);
172 }
173 catch (java.io.IOException e) {
174 e.printStackTrace();
175 }
176 popOffStack(); // end document
177 }
178
179 path_ = removePathLink(path_);
180 }
181
182 public void characters(char ch[], int start, int length) throws SAXException
183 {
184 String data = new String(ch, start, length).trim();
185 if (data.length() > 0 ) {
186 current_contents_ += data;
187 }
188 }
189
190 protected String appendPathLink(String path, String qName, Attributes atts)
191 {
192
193 path = path + "/"+qName;
194 if (atts.getLength()>0) {
195 String id = atts.getValue("gs2:id");
196 if (id != null) {
197 path += "[@gs2:id='"+id+"']";
198 }
199 else {
200 id = atts.getValue("gs3:id");
201 if (id != null) {
202 path += "[@gs3:id='"+id+"']";
203 }
204 }
205 }
206 return path;
207 }
208 protected String removePathLink(String path)
209 {
210
211 int i=path.lastIndexOf('/');
212 if (i==-1) {
213 path="";
214 } else {
215 path = path.substring(0, i);
216 }
217 return path;
218 }
219 /** these are what we save on the stack */
220 private class MyDocument
221 {
222 public Document doc = null;
223 public String contents = null;
224 public String tagname = "";
225
226 }
227
228 protected void pushOnStack()
229 {
230 if (current_doc_ != null) {
231 MyDocument save = new MyDocument();
232 save.doc = current_doc_;
233 save.contents = current_contents_;
234 save.tagname = current_node_;
235 stack_.push(save);
236 }
237 current_doc_ = new Document();
238 current_contents_ = "";
239 current_node_ = "";
240 }
241
242 protected void popOffStack()
243 {
244 if (!stack_.empty()) {
245 MyDocument saved = (MyDocument)stack_.pop();
246 current_doc_ = saved.doc;
247 current_contents_ = saved.contents;
248 current_node_ = saved.tagname;
249 } else {
250 current_doc_ = new Document();
251 current_contents_ = "";
252 current_node_ = "";
253 }
254 }
255
256
257}
258
259
Note: See TracBrowser for help on using the repository browser.