source: main/trunk/greenstone3/web/sites/localsite/collect/gberg/java/Indexer.java@ 32485

Last change on this file since 32485 was 32485, checked in by kjdon, 6 years ago

greenstone now uses lucene 4.7.2, so upgrading this code to match

  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1import org.greenstone.gsdl3.util.GSEntityResolver;
2import org.greenstone.LuceneWrapper4.GSLuceneUtil;
3import org.greenstone.LuceneWrapper4.GSLuceneConstants;
4
5import org.xml.sax.Attributes;
6import org.xml.sax.helpers.DefaultHandler;
7import org.xml.sax.InputSource;
8import org.xml.sax.SAXException;
9//import org.xml.sax.SAXParseException;
10import javax.xml.parsers.SAXParser;
11import javax.xml.parsers.SAXParserFactory;
12
13
14import org.apache.lucene.analysis.Analyzer;
15import org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer;
16import org.apache.lucene.index.IndexWriter;
17import org.apache.lucene.index.IndexWriterConfig;
18import org.apache.lucene.store.FSDirectory;
19import org.apache.lucene.util.Version;
20
21import org.apache.lucene.document.Document;
22import org.apache.lucene.document.Field;
23import org.apache.lucene.document.StoredField;
24import org.apache.lucene.document.TextField;
25//import org.apache.lucene.document.DateField;
26import org.apache.lucene.index.IndexWriter;
27import org.apache.lucene.analysis.standard.StandardAnalyzer;
28import org.apache.lucene.util.Version;
29//import org.apache.lucene.analysis.SimpleAnalyzer;
30
31import java.util.Stack;
32import java.io.FileInputStream;
33import java.io.File;
34import java.net.URL;
35
36
37
38public class Indexer extends DefaultHandler {
39 IndexWriter writer = null;
40 SAXParser sax_parser = null;
41 Stack stack = null;
42 String path = "";
43 String current_node = "";
44 String current_contents = "";
45 Document current_doc = null;
46 String scope = "";
47 protected String file_id = null;
48 private String base_path = null;
49 /** pass in true if want to create a new index, false if want to use the existing one */
50 public Indexer (File index_dir, boolean create) {
51 try {
52 stack = new Stack();
53 SAXParserFactory sax_factory = SAXParserFactory.newInstance();
54 sax_parser = sax_factory.newSAXParser();
55 Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
56 Analyzer ltcAn = new LimitTokenCountAnalyzer(analyzer,Integer.MAX_VALUE);
57
58 IndexWriterConfig.OpenMode open_mode;
59 if (create) {
60 open_mode = IndexWriterConfig.OpenMode.CREATE;
61 } else {
62 open_mode = IndexWriterConfig.OpenMode.APPEND;
63 }
64 IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_47, ltcAn);
65 indexWriterConfig.setOpenMode(open_mode);
66
67 FSDirectory index_fs_dir = FSDirectory.open(index_dir);
68 writer = new IndexWriter(index_fs_dir, indexWriterConfig);
69
70 } catch (Exception e) {
71
72 }
73 }
74
75 /** index one document */
76 public void index (String file_id, File file) {
77 this.file_id = file_id;
78 this.path = "";
79 this.base_path = file.getPath();
80 this.base_path = this.base_path.substring(0, this.base_path.lastIndexOf(File.separatorChar));
81 try {
82 sax_parser.parse(new InputSource(new FileInputStream(file)), this);
83 }
84 catch (Exception e) {
85 println("parse error:");
86 e.printStackTrace();
87 }
88 }
89
90 /** optimise the index */
91 public void finish() {
92 try {
93 writer.close();
94 } catch (Exception e) {}
95 }
96
97 protected void println(String s) { System.out.println(s); }
98
99 public void startDocument() throws SAXException {
100 println("Starting to index " + file_id);
101 }
102 public void endDocument() throws SAXException {
103 println("... indexing finished.");
104 }
105 public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException {
106 path = appendPathLink(path, qName, atts);
107 if (XMLTagInfo.isScopable(qName)) {
108 scope = qName;
109 }
110 if (XMLTagInfo.isIndexable(qName)) {
111 pushOnStack();
112 current_node = qName;
113 System.out.println("going to index "+qName );
114 String node_id = "";
115 String id = "<"+qName;
116 for (int i=0; i<atts.getLength(); i++) {
117 String name = atts.getQName(i);
118 String value = atts.getValue(i);
119 if (name!=null && value != null) {
120 id += " "+name+"="+value;
121 }
122 if (name.equals("gs3:id")) {
123 node_id = value;
124 }
125 }
126 id += "/>";
127
128 String value;
129 if (scope.equals(qName)) {
130 value = this.file_id+"."+qName;
131 } else {
132 value = this.file_id+"."+scope+"."+qName+"."+node_id;
133 }
134 current_doc.add(new StoredField("nodeID", value));
135
136 }
137 }
138 public void endElement(String uri, String localName, String qName) throws SAXException {
139 if (XMLTagInfo.isIndexable(qName) && qName.equals(current_node)) {
140 current_doc.add(new TextField("content", current_contents, Field.Store.NO));
141 try {
142 writer.addDocument(current_doc);
143 } catch (java.io.IOException e) {
144 e.printStackTrace();
145 }
146 popOffStack();
147 }
148
149 path = removePathLink(path);
150 }
151
152 public void characters(char ch[], int start, int length) throws SAXException {
153 String data = new String(ch, start, length).trim();
154 if (data.length() > 0 ) {
155 current_contents += data;
156 }
157 }
158
159 protected String appendPathLink(String path, String qName, Attributes atts) {
160
161 path = path + "/"+qName;
162 if (atts.getLength()>0) {
163 String id = atts.getValue("gs3:id");
164 if (id != null) {
165 path += "[@gs3:id='"+id+"']";
166 }
167 }
168 return path;
169 }
170 protected String removePathLink(String path) {
171
172 int i=path.lastIndexOf('/');
173 if (i==-1) {
174 path="";
175 } else {
176 path = path.substring(0, i);
177 }
178 return path;
179 }
180 /** these are what we save on the stack */
181 private class MyDocument {
182
183 public Document doc = null;
184 public String contents = null;
185 public String tagname = "";
186
187 }
188 protected void pushOnStack() {
189 if (current_doc != null) {
190 MyDocument save = new MyDocument();
191 save.doc = current_doc;
192 save.contents = current_contents;
193 save.tagname = current_node;
194 stack.push(save);
195 }
196 current_doc = new Document();
197 current_contents = "";
198 current_node = "";
199 }
200
201 protected void popOffStack() {
202 if (!stack.empty()) {
203 MyDocument saved = (MyDocument)stack.pop();
204 current_doc = saved.doc;
205 current_contents = saved.contents;
206 current_node = saved.tagname;
207 } else {
208 current_doc = new Document();
209 current_contents = "";
210 current_node = "";
211 }
212 }
213
214 public InputSource resolveEntity (String public_id, String system_id) {
215
216 if (system_id.startsWith("file://")) {
217 return new InputSource(system_id);
218 }
219 if (!system_id.startsWith(File.separator)) {
220 system_id = base_path+File.separatorChar+system_id;
221 }
222 return new InputSource("file://"+system_id);
223 }
224
225}
226
227
Note: See TracBrowser for help on using the repository browser.