- Timestamp: 2009-02-01T14:41:33+13:00 (15 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneIndexer.java
r18132 r18444 59 59 public class GS2LuceneIndexer { 60 60 61 protected static boolean debug = false; 62 63 protected static void debug(String message) 64 { 65 if (debug) { 66 System.err.println(message); 67 } 68 } 69 70 61 71 public static void main (String args[]) throws Exception 62 72 { … … 82 92 if (i<argc) { 83 93 verbosity = Integer.parseInt(args[i]); 84 } 94 if (verbosity>=5) { 95 debug = true; 96 } 97 } 98 } 99 else if (args[i].equals("-debug")) { 100 debug = true; 85 101 } 86 102 else { … … 128 144 while ((line = brin.readLine()) != null) { 129 145 xml_text.append(line); 130 if (line.startsWith("</Doc>")) { 146 147 debug("Got line " + line); 148 149 if (line.endsWith("</Delete>")) { 150 151 indexer.delete(xml_text.toString()); 152 xml_text = new StringBuffer(1024); 153 } 154 else if (line.startsWith("</Doc>")) { 131 155 indexer.index(xml_text.toString()); 132 156 xml_text = new StringBuffer(1024); … … 162 186 String current_contents_ = ""; 163 187 188 String mode_ = ""; 164 189 protected String file_id_ = null; 165 190 166 191 static private String[] stop_words = GS2Analyzer.STOP_WORDS; 192 167 193 168 194 /** pass in true if want to create a new index, false if want to use the existing one */ … … 198 224 public void index (String file_id, File file) 199 225 { 226 mode_ = "index"; 200 227 file_id_ = file_id; 201 228 path_ = ""; … … 215 242 public void index (String xml_text) 216 243 { 244 mode_ = "index"; 217 245 file_id_ = "<xml doc on stdin>"; 246 path_ = ""; 247 248 try { 249 sax_parser_.parse(new InputSource(new StringReader(xml_text)), this); 250 } 251 catch (Exception e) { 252 println("parse error:"); 253 e.printStackTrace(); 254 } 255 } 256 257 /** delete one document, based on doc_id in <Delete>doc_id</Delete> */ 258 public void delete(String xml_text) 259 { 260 mode_ = "delete"; 261 file_id_ = "<delete doc>"; 218 262 path_ = ""; 219 263 … … 250 294 public void startDocument() throws SAXException 251 295 { 252 println("Starting to index " + 
file_id_); 253 print("["); 296 if (mode_.equals("index")) { 297 println("Starting to index " + file_id_); 298 print("["); 299 } 254 300 } 255 301 256 302 public void endDocument() throws SAXException 257 303 { 258 println("]"); 259 println("... indexing finished."); 304 if (mode_.equals("index")) { 305 println("]"); 306 println("... indexing finished."); 307 } 260 308 } 261 309 … … 263 311 throws SAXException 264 312 { 265 path_ = appendPathLink(path_, qName, atts); 266 267 if (qName.equals(doc_tag_level_)) { 268 pushOnStack(); // start new doc 269 current_node_ = qName; 270 271 String node_id = atts.getValue("gs2:id"); 272 print(" " + qName + ": " + node_id ); 273 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED)); 274 275 current_doc_oid_ = atts.getValue("gs2:docOID"); 276 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED)); 277 } 278 279 if (isIndexable(atts)) { 280 indexable_current_node_ = qName; 281 } 282 else { 283 indexable_current_node_ = ""; 313 if (mode_.equals("index")) { 314 path_ = appendPathLink(path_, qName, atts); 315 316 if (qName.equals(doc_tag_level_)) { 317 pushOnStack(); // start new doc 318 current_node_ = qName; 319 320 String node_id = atts.getValue("gs2:id"); 321 print(" " + qName + ": " + node_id ); 322 current_doc_.add(new Field("nodeID", node_id, Field.Store.YES, Field.Index.UN_TOKENIZED)); 323 324 current_doc_oid_ = atts.getValue("gs2:docOID"); 325 current_doc_.add(new Field("docOID", current_doc_oid_, Field.Store.YES, Field.Index.UN_TOKENIZED)); 326 } 327 328 if (isIndexable(atts)) { 329 indexable_current_node_ = qName; 330 } 331 else { 332 indexable_current_node_ = ""; 333 } 284 334 } 285 335 } … … 300 350 public void endElement(String uri, String localName, String qName) throws SAXException 301 351 { 302 if (qName.equals(indexable_current_node_)) 303 { 304 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, 
Field.TermVector.YES)); 305 // The byXX fields are used for sorting search results 306 // We don't want to do that for Text or AllFields fields 307 // They need to be untokenised for sorting 308 if (!qName.equals("TX") && !qName.equals("ZZ")) 309 { 310 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO)); 311 } 312 313 current_contents_ = ""; 314 } 315 316 if (qName.equals(doc_tag_level_)) { 317 try { 318 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_); 319 } 320 catch (java.io.IOException e) { 321 e.printStackTrace(); 322 } 323 popOffStack(); // end document 324 } 325 326 path_ = removePathLink(path_); 352 if (mode_.equals("index")) { 353 if (qName.equals(indexable_current_node_)) 354 { 355 current_doc_.add(new Field(qName, current_contents_, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES)); 356 // The byXX fields are used for sorting search results 357 // We don't want to do that for Text or AllFields fields 358 // They need to be untokenised for sorting 359 if (!qName.equals("TX") && !qName.equals("ZZ")) 360 { 361 current_doc_.add(new Field("by" + qName, current_contents_, Field.Store.NO, Field.Index.UN_TOKENIZED, Field.TermVector.NO)); 362 } 363 364 current_contents_ = ""; 365 } 366 367 if (qName.equals(doc_tag_level_)) { 368 try { 369 writer_.updateDocument(new Term("docOID", current_doc_oid_), current_doc_, analyzer_); 370 } 371 catch (java.io.IOException e) { 372 e.printStackTrace(); 373 } 374 popOffStack(); // end document 375 } 376 377 path_ = removePathLink(path_); 378 } 379 else if (mode_.equals("delete")) { 380 if (qName.equals("Delete")) { 381 try { 382 deleteDocument(current_contents_); 383 current_contents_ = ""; 384 } 385 catch (java.io.IOException e) { 386 e.printStackTrace(); 387 } 388 } 389 } 327 390 } 328 391 … … 404 467 } 405 468 } 469 470 471 protected void deleteDocument(String doc_id) 472 throws IOException 473 { 474 
debug("GS2LuceneDelete.deleteDocument(" + doc_id + ")"); 475 debug("- Initial number of documents in index: " + writer_.docCount()); 476 writer_.deleteDocuments(new Term("docOID", doc_id)); 477 debug("- Final number of documents in index: " + writer_.docCount()); 478 } 479 480 406 481 } 407 482 }
Note: See TracChangeset for help on using the changeset viewer.