source: greenstone3/trunk/web/sites/localsite/collect/gberg/java/Indexer.java@ 20341

Last change on this file since 20341 was 20341, checked in by davidb, 15 years ago

Update of Indexer.java code to work with latest version of Lucene API

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1import org.greenstone.gsdl3.util.GSEntityResolver;
2
3import org.xml.sax.Attributes;
4import org.xml.sax.helpers.DefaultHandler;
5import org.xml.sax.InputSource;
6import org.xml.sax.SAXException;
7//import org.xml.sax.SAXParseException;
8import javax.xml.parsers.SAXParser;
9import javax.xml.parsers.SAXParserFactory;
10
11import org.apache.lucene.document.Document;
12import org.apache.lucene.document.Field;
13//import org.apache.lucene.document.DateField;
14import org.apache.lucene.index.IndexWriter;
15import org.apache.lucene.analysis.standard.StandardAnalyzer;
16//import org.apache.lucene.analysis.SimpleAnalyzer;
17
18import java.util.Stack;
19import java.io.FileInputStream;
20import java.io.File;
21import java.net.URL;
22
23
24
25public class Indexer extends DefaultHandler {
26 IndexWriter writer = null;
27 SAXParser sax_parser = null;
28 Stack stack = null;
29 String path = "";
30 String current_node = "";
31 String current_contents = "";
32 Document current_doc = null;
33 String scope = "";
34 protected String file_id = null;
35 private String base_path = null;
36 /** pass in true if want to create a new index, false if want to use the existing one */
37 public Indexer (File index_dir, boolean create) {
38 try {
39 stack = new Stack();
40 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
41 sax_parser = sax_factory.newSAXParser();
42 writer = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
43 if (create) {
44 writer.optimize();
45 }
46
47 } catch (Exception e) {
48
49 }
50 }
51
52 /** index one document */
53 public void index (String file_id, File file) {
54 this.file_id = file_id;
55 this.path = "";
56 this.base_path = file.getPath();
57 this.base_path = this.base_path.substring(0, this.base_path.lastIndexOf(File.separatorChar));
58 try {
59 sax_parser.parse(new InputSource(new FileInputStream(file)), this);
60 }
61 catch (Exception e) {
62 println("parse error:");
63 e.printStackTrace();
64 }
65 }
66
67 /** optimise the index */
68 public void finish() {
69 try {
70 writer.optimize();
71 writer.close();
72 } catch (Exception e) {}
73 }
74
75 protected void println(String s) { System.out.println(s); }
76
77 public void startDocument() throws SAXException {
78 println("Starting to index " + file_id);
79 }
80 public void endDocument() throws SAXException {
81 println("... indexing finished.");
82 }
83 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
84 path = appendPathLink(path, qName, atts);
85 if (XMLTagInfo.isScopable(qName)) {
86 scope = qName;
87 }
88 if (XMLTagInfo.isIndexable(qName)) {
89 pushOnStack();
90 current_node = qName;
91 System.out.println("going to index "+qName );
92 String node_id = "";
93 String id = "<"+qName;
94 for (int i=0; i<atts.getLength(); i++) {
95 String name = atts.getQName(i);
96 String value = atts.getValue(i);
97 if (name!=null && value != null) {
98 id += " "+name+"="+value;
99 }
100 if (name.equals("gs3:id")) {
101 node_id = value;
102 }
103 }
104 id += "/>";
105
106 if (scope.equals(qName)) {
107
108 current_doc.add(new Field("nodeID", this.file_id+"."+qName,
109 Field.Store.YES,Field.Index.NO));
110 } else {
111 current_doc.add(new Field("nodeID", this.file_id+"."+scope+"."+qName+"."+node_id,
112 Field.Store.YES,Field.Index.NO));
113 }
114 }
115 }
116 public void endElement(String uri, String localName, String qName) throws SAXException {
117 if (XMLTagInfo.isIndexable(qName) && qName.equals(current_node)) {
118 current_doc.add(new Field("content", current_contents,
119 Field.Store.NO,Field.Index.TOKENIZED));
120 try {
121 writer.addDocument(current_doc);
122 } catch (java.io.IOException e) {
123 e.printStackTrace();
124 }
125 popOffStack();
126 }
127
128 path = removePathLink(path);
129 }
130
131 public void characters(char ch[], int start, int length) throws SAXException {
132 String data = new String(ch, start, length).trim();
133 if (data.length() > 0 ) {
134 current_contents += data;
135 }
136 }
137
138 protected String appendPathLink(String path, String qName, Attributes atts) {
139
140 path = path + "/"+qName;
141 if (atts.getLength()>0) {
142 String id = atts.getValue("gs3:id");
143 if (id != null) {
144 path += "[@gs3:id='"+id+"']";
145 }
146 }
147 return path;
148 }
149 protected String removePathLink(String path) {
150
151 int i=path.lastIndexOf('/');
152 if (i==-1) {
153 path="";
154 } else {
155 path = path.substring(0, i);
156 }
157 return path;
158 }
159 /** these are what we save on the stack */
160 private class MyDocument {
161
162 public Document doc = null;
163 public String contents = null;
164 public String tagname = "";
165
166 }
167 protected void pushOnStack() {
168 if (current_doc != null) {
169 MyDocument save = new MyDocument();
170 save.doc = current_doc;
171 save.contents = current_contents;
172 save.tagname = current_node;
173 stack.push(save);
174 }
175 current_doc = new Document();
176 current_contents = "";
177 current_node = "";
178 }
179
180 protected void popOffStack() {
181 if (!stack.empty()) {
182 MyDocument saved = (MyDocument)stack.pop();
183 current_doc = saved.doc;
184 current_contents = saved.contents;
185 current_node = saved.tagname;
186 } else {
187 current_doc = new Document();
188 current_contents = "";
189 current_node = "";
190 }
191 }
192
193 public InputSource resolveEntity (String public_id, String system_id) {
194
195 if (system_id.startsWith("file://")) {
196 return new InputSource(system_id);
197 }
198 if (!system_id.startsWith(File.separator)) {
199 system_id = base_path+File.separatorChar+system_id;
200 }
201 return new InputSource("file://"+system_id);
202 }
203
204}
205
206
Note: See TracBrowser for help on using the repository browser.