| | 146 | |
|---|
| | 147 | |
|---|
| | 148 | static public class Indexer extends DefaultHandler |
|---|
| | 149 | { |
|---|
| | 150 | IndexWriter writer_ = null; |
|---|
| | 151 | SAXParser sax_parser_ = null; |
|---|
| | 152 | String doc_tag_level_ = null; |
|---|
| | 153 | |
|---|
| | 154 | Stack stack_ = null; |
|---|
| | 155 | String path_ = ""; |
|---|
| | 156 | |
|---|
| | 157 | Document current_doc_ = null; |
|---|
| | 158 | String current_node_ = ""; |
|---|
| | 159 | String indexable_current_node_ = ""; |
|---|
| | 160 | String current_contents_ = ""; |
|---|
| | 161 | |
|---|
| | 162 | protected String file_id_ = null; |
|---|
| | 163 | |
|---|
| | 164 | /** pass in true if want to create a new index, false if want to use the existing one */ |
|---|
| | 165 | public Indexer (String doc_tag_level, File index_dir, boolean create) |
|---|
| | 166 | { |
|---|
| | 167 | doc_tag_level_ = doc_tag_level; |
|---|
| | 168 | |
|---|
| | 169 | try { |
|---|
| | 170 | stack_ = new Stack(); |
|---|
| | 171 | SAXParserFactory sax_factory = SAXParserFactory.newInstance(); |
|---|
| | 172 | sax_parser_ = sax_factory.newSAXParser(); |
|---|
| | 173 | |
|---|
| | 174 | XMLReader reader = sax_parser_.getXMLReader(); |
|---|
| | 175 | reader.setFeature("http://xml.org/sax/features/validation", false); |
|---|
| | 176 | |
|---|
| | 177 | writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create); |
|---|
| | 178 | // by default, will only index 10,000 words per document |
|---|
| | 179 | // Can throw out_of_memory errors |
|---|
| | 180 | writer_.setMaxFieldLength(Integer.MAX_VALUE); |
|---|
| | 181 | if (create) { |
|---|
| | 182 | writer_.optimize(); |
|---|
| | 183 | } |
|---|
| | 184 | |
|---|
| | 185 | } catch (Exception e) { |
|---|
| | 186 | // do nothing! |
|---|
| | 187 | } |
|---|
| | 188 | } |
|---|
| | 189 | |
|---|
| | 190 | /** index one document */ |
|---|
| | 191 | public void index (String file_id, File file) |
|---|
| | 192 | { |
|---|
| | 193 | file_id_ = file_id; |
|---|
| | 194 | path_ = ""; |
|---|
| | 195 | String base_path = file.getPath(); |
|---|
| | 196 | base_path = base_path.substring(0, base_path.lastIndexOf(File.separatorChar)); |
|---|
| | 197 | |
|---|
| | 198 | try { |
|---|
| | 199 | sax_parser_.parse(new InputSource(new FileInputStream(file)), this); |
|---|
| | 200 | } |
|---|
| | 201 | catch (Exception e) { |
|---|
| | 202 | println("parse error:"); |
|---|
| | 203 | e.printStackTrace(); |
|---|
| | 204 | } |
|---|
| | 205 | } |
|---|
| | 206 | |
|---|
| | 207 | /** index one document stored as string*/ |
|---|
| | 208 | public void index (String xml_text) |
|---|
| | 209 | { |
|---|
| | 210 | file_id_ = "<xml doc on stdin>"; |
|---|
| | 211 | path_ = ""; |
|---|
| | 212 | |
|---|
| | 213 | try { |
|---|
| | 214 | sax_parser_.parse(new InputSource(new StringReader(xml_text)), this); |
|---|
| | 215 | } |
|---|
| | 216 | catch (Exception e) { |
|---|
| | 217 | println("parse error:"); |
|---|
| | 218 | e.printStackTrace(); |
|---|
| | 219 | } |
|---|
| | 220 | } |
|---|
| | 221 | |
|---|
| | 222 | public void finish() |
|---|
| | 223 | { |
|---|
| | 224 | /** optimise the index */ |
|---|
| | 225 | try { |
|---|
| | 226 | writer_.optimize(); |
|---|
| | 227 | writer_.close(); |
|---|
| | 228 | } |
|---|
| | 229 | catch (Exception e) { |
|---|
| | 230 | } |
|---|
| | 231 | } |
|---|
| | 232 | |
|---|
| | 233 | protected void print(String s) |
|---|
| | 234 | { |
|---|
| | 235 | System.out.print(s); |
|---|
| | 236 | } |
|---|
| | 237 | |
|---|
| | 238 | protected void println(String s) |
|---|
| | 239 | { |
|---|
| | 240 | System.out.println(s); |
|---|
| | 241 | } |
|---|
| | 242 | |
|---|
| | 243 | public void startDocument() throws SAXException |
|---|
| | 244 | { |
|---|
| | 245 | println("Starting to index " + file_id_); |
|---|
| | 246 | print("["); |
|---|
| | 247 | } |
|---|
| | 248 | |
|---|
| | 249 | public void endDocument() throws SAXException |
|---|
| | 250 | { |
|---|
| | 251 | println("]"); |
|---|
| | 252 | println("... indexing finished."); |
|---|
| | 253 | } |
|---|
| | 254 | |
|---|
| | 255 | public void startElement(String uri, String localName, String qName, Attributes atts) |
|---|
| | 256 | throws SAXException |
|---|
| | 257 | { |
|---|
| | 258 | path_ = appendPathLink(path_, qName, atts); |
|---|
| | 259 | |
|---|
| | 260 | if (qName.equals(doc_tag_level_)) { |
|---|
| | 261 | pushOnStack(); // start new doc |
|---|
| | 262 | current_node_ = qName; |
|---|
| | 263 | String node_id = atts.getValue("gs2:id"); |
|---|
| | 264 | |
|---|
| | 265 | print(" " + qName + ": " + node_id ); |
|---|
| | 266 | current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED)); |
|---|
| | 267 | } |
|---|
| | 268 | |
|---|
| | 269 | if (XMLTagInfo.isIndexable(atts)) { |
|---|
| | 270 | indexable_current_node_ = qName; |
|---|
| | 271 | } |
|---|
| | 272 | else { |
|---|
| | 273 | indexable_current_node_ = ""; |
|---|
| | 274 | } |
|---|
| | 275 | |
|---|
| | 276 | } |
|---|
| | 277 | |
|---|
| | 278 | public void endElement(String uri, String localName, String qName) throws SAXException |
|---|
| | 279 | { |
|---|
| | 280 | if (qName.equals(indexable_current_node_)) |
|---|
| | 281 | { |
|---|
| | 282 | current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); |
|---|
| | 283 | // We only need the term vector for the TX field |
|---|
| | 284 | if (!qName.equals("TX")) |
|---|
| | 285 | { |
|---|
| | 286 | current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO)); |
|---|
| | 287 | } |
|---|
| | 288 | |
|---|
| | 289 | current_contents_ = ""; |
|---|
| | 290 | } |
|---|
| | 291 | |
|---|
| | 292 | if (qName.equals(doc_tag_level_)) { |
|---|
| | 293 | try { |
|---|
| | 294 | writer_.addDocument(current_doc_); |
|---|
| | 295 | } |
|---|
| | 296 | catch (java.io.IOException e) { |
|---|
| | 297 | e.printStackTrace(); |
|---|
| | 298 | } |
|---|
| | 299 | popOffStack(); // end document |
|---|
| | 300 | } |
|---|
| | 301 | |
|---|
| | 302 | path_ = removePathLink(path_); |
|---|
| | 303 | } |
|---|
| | 304 | |
|---|
| | 305 | public void characters(char ch[], int start, int length) throws SAXException |
|---|
| | 306 | { |
|---|
| | 307 | String data = new String(ch, start, length).trim(); |
|---|
| | 308 | if (data.length() > 0 ) { |
|---|
| | 309 | current_contents_ += data; |
|---|
| | 310 | } |
|---|
| | 311 | } |
|---|
| | 312 | |
|---|
| | 313 | protected String appendPathLink(String path, String qName, Attributes atts) |
|---|
| | 314 | { |
|---|
| | 315 | |
|---|
| | 316 | path = path + "/"+qName; |
|---|
| | 317 | if (atts.getLength()>0) { |
|---|
| | 318 | String id = atts.getValue("gs2:id"); |
|---|
| | 319 | if (id != null) { |
|---|
| | 320 | path += "[@gs2:id='"+id+"']"; |
|---|
| | 321 | } |
|---|
| | 322 | else { |
|---|
| | 323 | id = atts.getValue("gs3:id"); |
|---|
| | 324 | if (id != null) { |
|---|
| | 325 | path += "[@gs3:id='"+id+"']"; |
|---|
| | 326 | } |
|---|
| | 327 | } |
|---|
| | 328 | } |
|---|
| | 329 | return path; |
|---|
| | 330 | } |
|---|
| | 331 | |
|---|
| | 332 | protected String removePathLink(String path) |
|---|
| | 333 | { |
|---|
| | 334 | |
|---|
| | 335 | int i=path.lastIndexOf('/'); |
|---|
| | 336 | if (i==-1) { |
|---|
| | 337 | path=""; |
|---|
| | 338 | } else { |
|---|
| | 339 | path = path.substring(0, i); |
|---|
| | 340 | } |
|---|
| | 341 | return path; |
|---|
| | 342 | } |
|---|
| | 343 | |
|---|
| | 344 | |
|---|
| | 345 | /** these are what we save on the stack */ |
|---|
| | 346 | private class MyDocument |
|---|
| | 347 | { |
|---|
| | 348 | public Document doc = null; |
|---|
| | 349 | public String contents = null; |
|---|
| | 350 | public String tagname = ""; |
|---|
| | 351 | |
|---|
| | 352 | } |
|---|
| | 353 | |
|---|
| | 354 | |
|---|
| | 355 | protected void pushOnStack() |
|---|
| | 356 | { |
|---|
| | 357 | if (current_doc_ != null) { |
|---|
| | 358 | MyDocument save = new MyDocument(); |
|---|
| | 359 | save.doc = current_doc_; |
|---|
| | 360 | save.contents = current_contents_; |
|---|
| | 361 | save.tagname = current_node_; |
|---|
| | 362 | stack_.push(save); |
|---|
| | 363 | } |
|---|
| | 364 | current_doc_ = new Document(); |
|---|
| | 365 | current_contents_ = ""; |
|---|
| | 366 | current_node_ = ""; |
|---|
| | 367 | } |
|---|
| | 368 | |
|---|
| | 369 | protected void popOffStack() |
|---|
| | 370 | { |
|---|
| | 371 | if (!stack_.empty()) { |
|---|
| | 372 | MyDocument saved = (MyDocument)stack_.pop(); |
|---|
| | 373 | current_doc_ = saved.doc; |
|---|
| | 374 | current_contents_ = saved.contents; |
|---|
| | 375 | current_node_ = saved.tagname; |
|---|
| | 376 | } else { |
|---|
| | 377 | current_doc_ = new Document(); |
|---|
| | 378 | current_contents_ = ""; |
|---|
| | 379 | current_node_ = ""; |
|---|
| | 380 | } |
|---|
| | 381 | } |
|---|
| | 382 | } |
|---|