[8521] | 1 |
|
---|
| 2 | import org.xml.sax.Attributes;
|
---|
| 3 | import org.xml.sax.helpers.DefaultHandler;
|
---|
| 4 | import org.xml.sax.InputSource;
|
---|
| 5 | import org.xml.sax.SAXException;
|
---|
| 6 | import org.xml.sax.XMLReader;
|
---|
| 7 |
|
---|
| 8 | import javax.xml.parsers.SAXParser;
|
---|
| 9 | import javax.xml.parsers.SAXParserFactory;
|
---|
| 10 |
|
---|
| 11 | import org.apache.lucene.document.Document;
|
---|
| 12 | import org.apache.lucene.document.Field;
|
---|
| 13 | import org.apache.lucene.index.IndexWriter;
|
---|
| 14 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
---|
| 15 |
|
---|
| 16 | import java.util.Stack;
|
---|
| 17 | import java.io.FileInputStream;
|
---|
| 18 | import java.io.File;
|
---|
| 19 | import java.io.StringReader;
|
---|
| 20 | import java.net.URL;
|
---|
| 21 |
|
---|
| 22 |
|
---|
| 23 |
|
---|
| 24 | public class Indexer extends DefaultHandler
|
---|
| 25 | {
|
---|
| 26 | IndexWriter writer_ = null;
|
---|
| 27 | SAXParser sax_parser_ = null;
|
---|
| 28 | String doc_tag_level_ = null;
|
---|
| 29 |
|
---|
| 30 | Stack stack_ = null;
|
---|
| 31 | String path_ = "";
|
---|
| 32 |
|
---|
| 33 | Document current_doc_ = null;
|
---|
| 34 | String current_node_ = "";
|
---|
| 35 | String indexable_current_node_ = "";
|
---|
| 36 | String current_contents_ = "";
|
---|
| 37 |
|
---|
| 38 | protected String file_id_ = null;
|
---|
| 39 |
|
---|
| 40 | /** pass in true if want to create a new index, false if want to use the existing one */
|
---|
| 41 | public Indexer (String doc_tag_level, File index_dir, boolean create)
|
---|
| 42 | {
|
---|
| 43 | doc_tag_level_ = doc_tag_level;
|
---|
| 44 |
|
---|
| 45 | try {
|
---|
| 46 | stack_ = new Stack();
|
---|
| 47 | SAXParserFactory sax_factory = SAXParserFactory.newInstance();
|
---|
| 48 | sax_parser_ = sax_factory.newSAXParser();
|
---|
| 49 |
|
---|
| 50 | XMLReader reader = sax_parser_.getXMLReader();
|
---|
| 51 | reader.setFeature("http://xml.org/sax/features/validation", false);
|
---|
| 52 |
|
---|
| 53 | writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
|
---|
[11245] | 54 | // by default, will only index 10,000 words per document
|
---|
| 55 | // Can throw out_of_memory errors
|
---|
| 56 | writer_.maxFieldLength = Integer.MAX_VALUE;
|
---|
[8521] | 57 | if (create) {
|
---|
| 58 | writer_.optimize();
|
---|
| 59 | }
|
---|
| 60 |
|
---|
| 61 | } catch (Exception e) {
|
---|
| 62 | // do nothing!
|
---|
| 63 | }
|
---|
| 64 | }
|
---|
| 65 |
|
---|
| 66 | /** index one document */
|
---|
| 67 | public void index (String file_id, File file)
|
---|
| 68 | {
|
---|
| 69 | file_id_ = file_id;
|
---|
| 70 | path_ = "";
|
---|
| 71 | String base_path = file.getPath();
|
---|
| 72 | base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar));
|
---|
| 73 |
|
---|
| 74 | try {
|
---|
| 75 | sax_parser_.parse(new InputSource(new FileInputStream(file)), this);
|
---|
| 76 | }
|
---|
| 77 | catch (Exception e) {
|
---|
| 78 | println("parse error:");
|
---|
| 79 | e.printStackTrace();
|
---|
| 80 | }
|
---|
| 81 | }
|
---|
| 82 |
|
---|
| 83 | /** index one document stored as string*/
|
---|
| 84 | public void index (String xml_text)
|
---|
| 85 | {
|
---|
[10164] | 86 | file_id_ = "<xml doc on stdin>";
|
---|
[8521] | 87 | path_ = "";
|
---|
| 88 |
|
---|
| 89 | try {
|
---|
| 90 | sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
|
---|
| 91 | }
|
---|
| 92 | catch (Exception e) {
|
---|
| 93 | println("parse error:");
|
---|
| 94 | e.printStackTrace();
|
---|
| 95 | }
|
---|
| 96 | }
|
---|
| 97 |
|
---|
| 98 | public void finish()
|
---|
| 99 | {
|
---|
| 100 | /** optimise the index */
|
---|
| 101 | try {
|
---|
| 102 | writer_.optimize();
|
---|
| 103 | writer_.close();
|
---|
| 104 | }
|
---|
| 105 | catch (Exception e) {
|
---|
| 106 | }
|
---|
| 107 | }
|
---|
| 108 |
|
---|
[10164] | 109 | protected void print(String s)
|
---|
| 110 | {
|
---|
| 111 | System.out.print(s);
|
---|
| 112 | }
|
---|
| 113 |
|
---|
[8521] | 114 | protected void println(String s)
|
---|
| 115 | {
|
---|
| 116 | System.out.println(s);
|
---|
| 117 | }
|
---|
| 118 |
|
---|
| 119 | public void startDocument() throws SAXException
|
---|
| 120 | {
|
---|
| 121 | println("Starting to index " + file_id_);
|
---|
[10164] | 122 | print("[");
|
---|
[8521] | 123 | }
|
---|
| 124 |
|
---|
| 125 | public void endDocument() throws SAXException
|
---|
| 126 | {
|
---|
[10164] | 127 | println("]");
|
---|
[8521] | 128 | println("... indexing finished.");
|
---|
| 129 | }
|
---|
| 130 |
|
---|
| 131 | public void startElement(String uri, String localName, String qName, Attributes atts)
|
---|
| 132 | throws SAXException
|
---|
| 133 | {
|
---|
| 134 | path_ = appendPathLink(path_, qName, atts);
|
---|
| 135 |
|
---|
| 136 | if (qName.equals(doc_tag_level_)) {
|
---|
| 137 | pushOnStack(); // start new doc
|
---|
| 138 | current_node_ = qName;
|
---|
| 139 | String node_id = atts.getValue("gs2:id");
|
---|
| 140 |
|
---|
[10164] | 141 | print(" " + qName + ": " + node_id );
|
---|
[8521] | 142 | current_doc_.add(Field.UnIndexed("nodeID", node_id));
|
---|
| 143 | }
|
---|
| 144 |
|
---|
| 145 | if (XMLTagInfo.isIndexable(atts)) {
|
---|
| 146 | indexable_current_node_ = qName;
|
---|
| 147 | }
|
---|
| 148 | else {
|
---|
| 149 | indexable_current_node_ = "";
|
---|
| 150 | }
|
---|
| 151 |
|
---|
| 152 | }
|
---|
| 153 | public void endElement(String uri, String localName, String qName) throws SAXException
|
---|
| 154 | {
|
---|
| 155 | if (qName.equals(indexable_current_node_)) {
|
---|
| 156 | current_doc_.add(Field.UnStored(qName, current_contents_));
|
---|
| 157 | current_contents_ = "";
|
---|
| 158 | }
|
---|
| 159 |
|
---|
| 160 | if (qName.equals(doc_tag_level_)) {
|
---|
| 161 | try {
|
---|
| 162 | writer_.addDocument(current_doc_);
|
---|
| 163 | }
|
---|
| 164 | catch (java.io.IOException e) {
|
---|
| 165 | e.printStackTrace();
|
---|
| 166 | }
|
---|
| 167 | popOffStack(); // end document
|
---|
| 168 | }
|
---|
| 169 |
|
---|
| 170 | path_ = removePathLink(path_);
|
---|
| 171 | }
|
---|
| 172 |
|
---|
| 173 | public void characters(char ch[], int start, int length) throws SAXException
|
---|
| 174 | {
|
---|
| 175 | String data = new String(ch, start, length).trim();
|
---|
| 176 | if (data.length() > 0 ) {
|
---|
| 177 | current_contents_ += data;
|
---|
| 178 | }
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 | protected String appendPathLink(String path, String qName, Attributes atts)
|
---|
| 182 | {
|
---|
| 183 |
|
---|
| 184 | path = path + "/"+qName;
|
---|
| 185 | if (atts.getLength()>0) {
|
---|
| 186 | String id = atts.getValue("gs2:id");
|
---|
| 187 | if (id != null) {
|
---|
| 188 | path += "[@gs2:id='"+id+"']";
|
---|
| 189 | }
|
---|
| 190 | else {
|
---|
| 191 | id = atts.getValue("gs3:id");
|
---|
| 192 | if (id != null) {
|
---|
| 193 | path += "[@gs3:id='"+id+"']";
|
---|
| 194 | }
|
---|
| 195 | }
|
---|
| 196 | }
|
---|
| 197 | return path;
|
---|
| 198 | }
|
---|
| 199 | protected String removePathLink(String path)
|
---|
| 200 | {
|
---|
| 201 |
|
---|
| 202 | int i=path.lastIndexOf('/');
|
---|
| 203 | if (i==-1) {
|
---|
| 204 | path="";
|
---|
| 205 | } else {
|
---|
| 206 | path = path.substring(0, i);
|
---|
| 207 | }
|
---|
| 208 | return path;
|
---|
| 209 | }
|
---|
| 210 | /** these are what we save on the stack */
|
---|
| 211 | private class MyDocument
|
---|
| 212 | {
|
---|
| 213 | public Document doc = null;
|
---|
| 214 | public String contents = null;
|
---|
| 215 | public String tagname = "";
|
---|
| 216 |
|
---|
| 217 | }
|
---|
| 218 |
|
---|
| 219 | protected void pushOnStack()
|
---|
| 220 | {
|
---|
| 221 | if (current_doc_ != null) {
|
---|
| 222 | MyDocument save = new MyDocument();
|
---|
| 223 | save.doc = current_doc_;
|
---|
| 224 | save.contents = current_contents_;
|
---|
| 225 | save.tagname = current_node_;
|
---|
| 226 | stack_.push(save);
|
---|
| 227 | }
|
---|
| 228 | current_doc_ = new Document();
|
---|
| 229 | current_contents_ = "";
|
---|
| 230 | current_node_ = "";
|
---|
| 231 | }
|
---|
| 232 |
|
---|
| 233 | protected void popOffStack()
|
---|
| 234 | {
|
---|
| 235 | if (!stack_.empty()) {
|
---|
| 236 | MyDocument saved = (MyDocument)stack_.pop();
|
---|
| 237 | current_doc_ = saved.doc;
|
---|
| 238 | current_contents_ = saved.contents;
|
---|
| 239 | current_node_ = saved.tagname;
|
---|
| 240 | } else {
|
---|
| 241 | current_doc_ = new Document();
|
---|
| 242 | current_contents_ = "";
|
---|
| 243 | current_node_ = "";
|
---|
| 244 | }
|
---|
| 245 | }
|
---|
| 246 |
|
---|
| 247 |
|
---|
| 248 | }
|
---|
| 249 |
|
---|
| 250 |
|
---|