Changeset 16432
- Timestamp:
- 2008-07-16T16:55:20+12:00 (15 years ago)
- Location:
- indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper
- Files:
-
- 1 deleted
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java
r16430 r16432 30 30 import java.io.*; 31 31 import java.util.Vector; 32 33 import org.xml.sax.Attributes; 34 import org.xml.sax.helpers.DefaultHandler; 35 import org.xml.sax.InputSource; 36 import org.xml.sax.SAXException; 37 import org.xml.sax.XMLReader; 38 39 import javax.xml.parsers.SAXParser; 40 import javax.xml.parsers.SAXParserFactory; 41 42 import org.apache.lucene.document.Document; 43 import org.apache.lucene.document.Field; 44 import org.apache.lucene.index.IndexWriter; 45 import org.apache.lucene.analysis.standard.StandardAnalyzer; 46 47 import java.util.Stack; 48 import java.io.FileInputStream; 49 import java.io.File; 50 import java.io.StringReader; 51 import java.net.URL; 52 32 53 33 54 /** … … 123 144 indexer.finish(); 124 145 }


    /**
     * SAX handler that feeds a parsed XML document into a Lucene index.
     *
     * Every element whose qualified name equals the configured document tag
     * level starts a new Lucene Document; when that element ends the
     * Document is added to the IndexWriter.  If such elements are nested,
     * the enclosing Document and its state are saved on a stack and
     * restored once the inner one has been written.
     */
    static public class Indexer extends DefaultHandler
    {
        IndexWriter writer_ = null;
        SAXParser sax_parser_ = null;
        /** qualified name of the element that delimits one Lucene document */
        String doc_tag_level_ = null;

        /** saved MyDocument entries for enclosing, not yet finished documents */
        Stack stack_ = null;
        /** slash-separated path of the elements currently open */
        String path_ = "";

        Document current_doc_ = null;
        String current_node_ = "";
        /** name of the most recently opened element if it was indexable, otherwise "" */
        String indexable_current_node_ = "";
        /** character data gathered since it was last reset */
        String current_contents_ = "";

        protected String file_id_ = null;

        /**
         * Sets up the SAX parser and opens the Lucene index.
         *
         * Failures are reported on stderr rather than thrown, so the
         * constructor's signature stays unchanged; after a failure
         * writer_ and/or sax_parser_ may still be null.
         *
         * @param doc_tag_level qualified name of the element that delimits a document
         * @param index_dir     directory holding the Lucene index
         * @param create        pass in true to create a new index, false to use the existing one
         */
        public Indexer (String doc_tag_level, File index_dir, boolean create)
        {
            doc_tag_level_ = doc_tag_level;

            try {
                stack_ = new Stack();
                SAXParserFactory sax_factory = SAXParserFactory.newInstance();
                sax_parser_ = sax_factory.newSAXParser();

                XMLReader reader = sax_parser_.getXMLReader();
                reader.setFeature("http://xml.org/sax/features/validation", false);

                writer_ = new IndexWriter(index_dir.getPath(), new StandardAnalyzer(), create);
                // By default only 10,000 words per document are indexed, so
                // lift the limit.  Note this can lead to out-of-memory errors.
                writer_.setMaxFieldLength(Integer.MAX_VALUE);
                if (create) {
                    writer_.optimize();
                }
            }
            catch (Exception e) {
                // Previously swallowed silently, which turned any set-up
                // problem into an unexplained failure later on.
                System.err.println("Indexer: failed to initialise for index directory " + index_dir);
                e.printStackTrace();
            }
        }

        /**
         * Indexes one document read from a file.
         *
         * @param file_id identifier used in progress messages
         * @param file    XML file to parse
         */
        public void index (String file_id, File file)
        {
            file_id_ = file_id;
            path_ = "";

            InputStream in = null;
            try {
                in = new FileInputStream(file);
                sax_parser_.parse(new InputSource(in), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
            finally {
                // the stream used to be left open after parsing
                if (in != null) {
                    try {
                        in.close();
                    }
                    catch (IOException ignored) {
                        // parsing is already over; a failed close cannot be acted on
                    }
                }
            }
        }

        /**
         * Indexes one document stored as a string.
         *
         * @param xml_text the XML text to parse
         */
        public void index (String xml_text)
        {
            file_id_ = "<xml doc on stdin>";
            path_ = "";

            try {
                sax_parser_.parse(new InputSource(new StringReader(xml_text)), this);
            }
            catch (Exception e) {
                println("parse error:");
                e.printStackTrace();
            }
        }

        /**
         * Optimises the index and closes the writer.  The writer is closed
         * even if optimising fails; problems are reported on stderr.
         */
        public void finish()
        {
            if (writer_ == null) {
                // the index could not be opened (already reported by the constructor)
                return;
            }

            try {
                writer_.optimize();
            }
            catch (Exception e) {
                System.err.println("Indexer: failed to optimise the index");
                e.printStackTrace();
            }
            finally {
                try {
                    writer_.close();
                }
                catch (Exception e) {
                    System.err.println("Indexer: failed to close the index");
                    e.printStackTrace();
                }
            }
        }

        /** Writes s to standard output without a trailing newline. */
        protected void print(String s)
        {
            System.out.print(s);
        }

        /** Writes s to standard output followed by a newline. */
        protected void println(String s)
        {
            System.out.println(s);
        }

        /** Prints the opening progress message for the current file. */
        public void startDocument() throws SAXException
        {
            println("Starting to index " + file_id_);
            print("[");
        }

        /** Prints the closing progress message. */
        public void endDocument() throws SAXException
        {
            println("]");
            println("... indexing finished.");
        }

        /**
         * Extends the element path, starts a new Lucene document when the
         * element is at the document tag level, and records whether the
         * element is indexable.
         *
         * @throws SAXException if a document-level element has no gs2:id attribute
         */
        public void startElement(String uri, String localName, String qName, Attributes atts)
            throws SAXException
        {
            path_ = appendPathLink(path_, qName, atts);

            if (qName.equals(doc_tag_level_)) {
                pushOnStack(); // start new doc
                current_node_ = qName;
                String node_id = atts.getValue("gs2:id");

                print(" " + qName + ": " + node_id );
                if (node_id == null) {
                    // A null field value would otherwise surface as a bare
                    // NullPointerException; name the offending element instead.
                    throw new SAXException("element <" + qName + "> in " + file_id_
                                           + " has no gs2:id attribute");
                }
                current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }

            // XMLTagInfo is defined outside this class; it decides from the
            // attributes whether the element's text should be indexed.
            if (XMLTagInfo.isIndexable(atts)) {
                indexable_current_node_ = qName;
            }
            else {
                indexable_current_node_ = "";
            }
        }

        /**
         * Adds the gathered text as fields when an indexable element ends,
         * writes the current document out when a document-level element
         * ends, and shortens the element path.
         */
        public void endElement(String uri, String localName, String qName) throws SAXException
        {
            if (qName.equals(indexable_current_node_)) {
                // tokenized field, with term vector, named after the element
                current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
                // every element except TX also gets an un-tokenized
                // "by<name>" field holding the same text
                if (!qName.equals("TX")) {
                    current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO));
                }

                current_contents_ = "";
            }

            if (qName.equals(doc_tag_level_)) {
                try {
                    writer_.addDocument(current_doc_);
                }
                catch (java.io.IOException e) {
                    e.printStackTrace();
                }
                popOffStack(); // end document
            }

            path_ = removePathLink(path_);
        }

        /**
         * Appends the trimmed character data to the gathered contents;
         * chunks that are empty after trimming are ignored.
         */
        public void characters(char ch[], int start, int length) throws SAXException
        {
            // NOTE(review): a parser may deliver one text node in several
            // chunks; trimming each chunk can then join words across a
            // chunk boundary.  Left unchanged to keep index contents stable.
            String data = new String(ch, start, length).trim();
            if (data.length() > 0 ) {
                current_contents_ += data;
            }
        }

        /**
         * Returns path extended by "/qName", followed by a
         * [@gs2:id='...'] predicate or, failing that, a [@gs3:id='...']
         * predicate when the element carries such an attribute.
         */
        protected String appendPathLink(String path, String qName, Attributes atts)
        {
            path = path + "/"+qName;
            if (atts.getLength()>0) {
                String id = atts.getValue("gs2:id");
                if (id != null) {
                    path += "[@gs2:id='"+id+"']";
                }
                else {
                    id = atts.getValue("gs3:id");
                    if (id != null) {
                        path += "[@gs3:id='"+id+"']";
                    }
                }
            }
            return path;
        }

        /**
         * Returns path with everything from its last '/' onwards removed,
         * or "" when it contains no '/'.
         */
        protected String removePathLink(String path)
        {
            int i=path.lastIndexOf('/');
            if (i==-1) {
                path="";
            } else {
                path = path.substring(0, i);
            }
            return path;
        }


        /** these are what we save on the stack */
        private static class MyDocument
        {
            public Document doc = null;
            public String contents = null;
            public String tagname = "";
        }


        /**
         * Saves the state of the document in progress (if any) on the
         * stack, then starts a fresh, empty document.
         */
        protected void pushOnStack()
        {
            if (current_doc_ != null) {
                MyDocument save = new MyDocument();
                save.doc = current_doc_;
                save.contents = current_contents_;
                save.tagname = current_node_;
                stack_.push(save);
            }
            current_doc_ = new Document();
            current_contents_ = "";
            current_node_ = "";
        }

        /**
         * Restores the most recently saved document state, or resets to a
         * fresh, empty document when nothing is saved.
         */
        protected void popOffStack()
        {
            if (!stack_.empty()) {
                MyDocument saved = (MyDocument)stack_.pop();
                current_doc_ = saved.doc;
                current_contents_ = saved.contents;
                current_node_ = saved.tagname;
            } else {
                current_doc_ = new Document();
                current_contents_ = "";
                current_node_ = "";
            }
        }
    }
125 383 }
Note:
See TracChangeset
for help on using the changeset viewer.