Changeset 9975 for trunk/gsdl3/src
- Timestamp:
- 2005-05-26T16:18:44+12:00 (19 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r9940 r9975 22 22 import org.greenstone.gsdl3.gs3build.metadata.*; 23 23 import org.greenstone.gsdl3.gs3build.xpointer.XPointer; 24 import org.greenstone.gsdl3.gs3build.util.DOMUtils; 24 25 import org.greenstone.gsdl3.util.GSXML; 25 26 import org.greenstone.gsdl3.util.Misc; 26 27 import org.greenstone.gsdl3.util.Processing; 27 28 28 public class MGIndexer extends AbstractIndexer 29 // for debug 30 import org.greenstone.gsdl3.util.XMLConverter; 31 32 public class MGIndexer //extends AbstractIndexer 33 implements IndexerInterface 29 34 { 30 35 int pass; … … 33 38 boolean firstDocument; 34 39 String outputDirectory; 35 // InputStream indexerFeedback;36 // InputStream indexerErrors;37 //OutputStream indexerTextfeed;38 40 StringBuffer indexBuffer; 39 //Process mg_passes;40 41 File textDirectory; 41 42 File indexDirectory; … … 45 46 String overallName; 46 47 47 String currentIndexName; 48 String currentIndexLevel; 49 String currentIndexField; 50 48 MGIndex current_index = null; 51 49 MGPassesWrapper mgPasses; 52 50 53 51 static final char END_OF_DOCUMENT = (char) 2; 54 static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg52 //static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg - don't use this yet 55 53 static final char END_OF_STREAM = (char) 4; 56 54 … … 62 60 String name=null; 63 61 String level=null; 64 String field=null;62 List fields=null; 65 63 boolean error = false;// assume built until we get an error 66 67 public MGIndex(String name, String level, String field) 64 65 public MGIndex(Element index_element) { 66 67 this.fields = new ArrayList(); 68 this.name = index_element.getAttribute(GSXML.NAME_ATT); 69 if (this.name.equals("")) { 70 // TODO make this dynamic 71 this.name = "xx"; 72 } 73 NodeList children = index_element.getChildNodes(); 74 for (int c = 0; c < children.getLength(); c ++) { 75 Node child = children.item(c); 76 77 if (child.getNodeType() == Node.ELEMENT_NODE) { 78 String name = child.getNodeName(); 79 80 if (name.equals(GSXML.LEVEL_ELEM)) { 81 this.level = DOMUtils.getNodeChildText(children.item(c)); 82 } 83 else if (name.equals(GSXML.FIELD_ELEM)) { 84 String fieldName = DOMUtils.getNodeChildText(children.item(c)); 85 this.fields.add(fieldName); 86 } 87 } 88 } 89 } 90 91 public MGIndex(String name, String level, List fields) 68 92 { 69 93 this.name = name; 70 94 this.level = level; 71 this.field = field; 72 } 73 95 this.fields = fields; 96 } 97 98 // old gs2 style config - can we get rid of this?? 74 99 public MGIndex(String indexLabel) 75 100 { 76 101 int colonAt = indexLabel.indexOf(':'); 77 102 78 if (colonAt >= 0) { 79 this.field = indexLabel.substring(colonAt+1); 80 this.level = indexLabel.substring(0, colonAt); 81 createIndexName(); 82 } 83 } 84 85 public String getLevel() 86 { 87 return this.level; 88 } 89 90 public String getField() 91 { 92 return this.field; 93 } 94 95 public String getName() 96 { 97 if (this.name==null || this.name.equals("")) { 98 createIndexName(); 99 } 100 return this.name; 101 } 102 103 public boolean hasError() { 104 return this.error; 105 } 106 public void setError(boolean b) { 107 this.error = b; 108 } 109 110 private void createIndexName() { 103 if (colonAt < 0) { 104 System.err.println("MGIndex(): invalid index specification: "+indexLabel); 105 return; 106 } 107 String field_string = indexLabel.substring(colonAt+1); 108 String [] field_list = field_string.split(","); 109 this.fields = new ArrayList(); 110 for (int i=0; i<field_list.length; i++) { 111 this.fields.add(field_list[i]); 112 } 113 this.level = indexLabel.substring(0, colonAt); 114 //createIndexName 111 115 StringBuffer new_name = new StringBuffer(); 112 116 new_name.append(Character.toLowerCase((char) this.level.charAt(0))); 113 114 117 int c, w; 115 118 w = 0; 116 119 c = 0; 117 while (c < this.field.length() && w < 2) {118 char ch = this.field.charAt(c);119 120 while (c < field_string.length() && w < 2) { 121 char ch = field_string.charAt(c); 122 120 123 ch = Character.toLowerCase(ch); 121 124 if (Character.isLetter(ch)) { … … 130 133 this.name = new_name.toString(); 131 134 } 135 136 137 public String getLevel() 138 { 139 return this.level; 140 } 141 142 public List getFields() 143 { 144 return this.fields; 145 } 146 147 public String getName() 148 { 149 // if (this.name==null || this.name.equals("")) { 150 // createIndexName(); 151 // } 152 return this.name; 153 } 154 155 public boolean hasError() { 156 return this.error; 157 } 158 public void setError(boolean b) { 159 this.error = b; 160 } 161 162 // private void createIndexName() { 163 // StringBuffer new_name = new StringBuffer(); 164 // new_name.append(Character.toLowerCase((char) this.level.charAt(0))); 165 166 // int c, w; 167 // w = 0; 168 // c = 0; 169 // String [] fields_concat = this.fields.toArray 170 // while (c < this.field.length() && w < 2) { 171 // char ch = this.field.charAt(c); 172 173 // ch = Character.toLowerCase(ch); 174 // if (Character.isLetter(ch)) { 175 // if (ch != 'a' && ch != 'e' && ch != 'i' && 176 // ch != 'o' && ch != 'u') { 177 // new_name.append(ch); 178 // w++; 179 // } 180 // } 181 // c ++; 182 // } 183 // this.name = new_name.toString(); 184 // } 185 186 132 187 } // MGIndex 133 188 … … 147 202 return this.overallName; 148 203 } 149 150 // private String getIndexDirectory(String level, String field)151 // { StringBuffer directory = new StringBuffer();152 // directory.append(Character.toLowerCase((char) level.charAt(0)));153 154 // int c, w; 155 // w = 0;156 // c = 0;157 // while (c < field.length() && w < 2) { 158 // char ch = field.charAt(c);159 160 // ch = Character.toLowerCase(ch);161 // if (Character.isLetter(ch)) { 162 // if (ch != 'a' && ch != 'e' && ch != 'i' && 163 // ch != 'o' && ch != 'u') { 164 // directory.append(ch);165 // w++; 166 // } 167 //}168 // c ++; 169 // } 170 // return directory.toString();171 //}204 205 public boolean configure(Node search_node) 206 { 207 NodeList index_children = GSXML.getChildrenByTagName(search_node, GSXML.INDEX_ELEM); 208 209 // add a text 'index' - we should be able to turn this off in the config file? actually mg needs a text index 210 ArrayList list = new ArrayList(); 211 list.add("text"); 212 MGIndex index = new MGIndex("text", "section", list); 213 indexes.add(index); 214 for (int i = 0; i < index_children.getLength(); i ++) { 215 Element index_elem = (Element)index_children.item(i); 216 index = new MGIndex(index_elem); 217 if (index.getName() != null && index.getLevel() != null && index.getFields()!= null) { 218 219 indexes.add(index); 220 } else { 221 System.err.println("invalid index spec, not including"+new XMLConverter().getPrettyString(index_elem)); 222 } 223 } 224 // TODO make sure all index names are unique 225 return true; 226 } 172 227 173 228 /** … … 206 261 } 207 262 208 public boolean addIndex(String name, String level, String field)209 {210 MGIndex index = new MGIndex(name, level, field);211 this.indexes.add(index);212 return true;213 }214 263 215 264 private Node recurseDOM(DocumentInterface metsDoc, Node node, … … 218 267 //String name, String namespace, String field) 219 268 { 269 List fields = current_index.getFields(); 220 270 // send out the ctrl-c...if this is 221 271 if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) { 222 272 // try doing this for all index types 223 if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 273 // actually we should only need to do this once ???? 274 if (this.pass == 0) { 275 //if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) { 224 276 METSDivision division = (METSDivision) structure; 225 277 … … 251 303 // by an end of document character. This ensures that all indexes use the 252 304 // same document numbering... 253 if (this.currentIndexLevel == null || 254 this.currentIndexLevel.equals(IndexerInterface.DOCUMENT_LEVEL)) { 305 if (this.current_index.getLevel().equals(IndexerInterface.DOCUMENT_LEVEL)) { 255 306 extraBuffer.append(END_OF_DOCUMENT); 256 307 } … … 259 310 this.documentSeqNo ++; 260 311 } 261 312 262 313 // produce the body here for metadata output of divisions - in the case of 263 314 // text output, that will happen below... 264 if (!this.currentIndexField.equals("text")) { 315 316 if (fields.size()>1 || !((String)fields.get(0)).equals("text")) { 317 // if there is only text, don't do this 265 318 METSDescriptive descriptive; 266 319 … … 268 321 269 322 String metadataId = division.getDefaultMetadataReference(); 270 323 // are there other metadata refs to get?? 271 324 descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId); 272 325 if (descriptive != null) { 273 List values = descriptive.getMetadata(namespace, this.currentIndexField); 274 275 if (values != null) { 276 Iterator valueIter = values.iterator(); 277 while (valueIter.hasNext()) { 278 String value = valueIter.next().toString(); 279 280 textBuffer.append(value); 281 if (valueIter.hasNext()) { 282 //textBuffer.append(END_OF_SECTION); 326 for (int i=0; i<fields.size(); i++) { 327 String field = (String)fields.get(i); 328 if (field.equals("text")) { 329 continue; 330 } 331 List values = descriptive.getMetadata(namespace, field); 332 if (values != null) { 333 Iterator valueIter = values.iterator(); 334 while (valueIter.hasNext()) { 335 String value = valueIter.next().toString(); 336 textBuffer.append(value); 337 textBuffer.append(" "); 283 338 } 284 339 } … … 287 342 } 288 343 } 289 344 290 345 // go through our children as required... 291 346 Iterator children = structure.getChildIterator(); 292 347 Node startNode; 348 boolean index_text = fields.contains("text"); 293 349 while (children.hasNext()) { 294 350 AbstractStructure child = (AbstractStructure) children.next(); … … 301 357 startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 302 358 } 303 //Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child); 304 305 // while this node isn't the child's start node, produce the HTML node text, if 306 // in text field mode... 307 if (this.currentIndexField.equals("text")) { 359 360 // while this node isn't the child's start node, produce the 361 // HTML node text, if in text field mode... 362 if (index_text) { 308 363 while (node != startNode) { 309 364 XPointer.printNode(node, textBuffer, false); 310 311 // print buffer to node 312 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); 313 } 314 } 315 365 node = XPointer.getNextNode(node); 366 } 367 } 368 316 369 // recurse to child 317 370 node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field); … … 320 373 // close a document - the actual closing \B will be done by the main 321 374 // loop, so only a required \C is printed here... 375 // why have we got STRUCTURE_TYPE here and DIVISION_TYPE above???? 322 376 if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) { 323 while (node != null) {324 if (this.currentIndexField.equals("text")) {377 if (index_text) { 378 while (node != null) { 325 379 XPointer.printNode(node, textBuffer, false); 326 }327 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));380 node = XPointer.getNextNode(node); 381 } 328 382 } 329 383 … … 354 408 public boolean indexDocument(DocumentID docID, DocumentInterface document) 355 409 { 356 if (this.pass == 0) {357 document.removeAllMetadata("gsdl3", "mgseqno");358 }359 410 360 411 if (!this.firstDocument) { … … 366 417 367 418 String docText = null; 368 369 int startSeqNo = this.sectionSeqNo; 419 // set the mgseqno if first pass 420 if (this.pass == 0) { 421 document.removeAllMetadata("gsdl3", "mgseqno"); 422 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(this.sectionSeqNo)); 423 } 424 370 425 this.sectionSeqNo ++; 371 426 427 //long start = System.currentTimeMillis(); 372 428 Document domDocument = document.getDOMDocument(); 373 429 if (domDocument != null) { … … 380 436 } 381 437 } 438 //long finish = System.currentTimeMillis(); 439 //System.err.println("dom doc = "+ Long.toString(finish-start)); 440 //start = System.currentTimeMillis(); 382 441 if (docText == null) { 383 442 System.err.println("dom doc or sections was null - asking for doc text"); 384 if (this.currentIndexField.equals("text")) { 385 //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText(); 386 docText = document.getDocumentText(); 387 } 388 else { 389 StringBuffer textBuffer = new StringBuffer(); 390 //textBuffer.append(END_OF_DOCUMENT); 391 List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); 392 if (values != null) { 393 Iterator valueIter = values.iterator(); 394 while (valueIter.hasNext()) { 395 String value = valueIter.next().toString(); 396 397 textBuffer.append(value); 398 if (valueIter.hasNext()) { 399 //textBuffer.append(END_OF_SECTION); 400 // sectionSeqNo ++; 443 StringBuffer doc_text_buffer = new StringBuffer(); 444 List fields = this.current_index.getFields(); 445 for (int i=0; i<fields.size(); i++) { 446 String field = (String)fields.get(i); 447 if (field.equals("text")) { 448 doc_text_buffer.append(document.getDocumentText()); 449 } else { 450 // its a metadata - do namespace properly!! 451 List values = document.getDocumentMetadataItem("gsdl3", field); 452 if (values != null) { 453 Iterator valueIter = values.iterator(); 454 while (valueIter.hasNext()) { 455 String value = valueIter.next().toString(); 456 doc_text_buffer.append(value); 401 457 } 402 458 } 403 459 } 404 else { 405 textBuffer.append("No data"); 406 } 407 docText = textBuffer.toString(); 408 } 460 } // for each field 461 docText = doc_text_buffer.toString(); 409 462 sectionSeqNo ++; 410 463 } 411 464 //finish = System.currentTimeMillis(); 465 //System.err.println("whole doc = "+ Long.toString(finish-start)); 412 466 413 467 this.indexBuffer.append(docText); 414 468 // remember that we're not on the first document, 415 469 this.firstDocument = false; 416 // assign the sequence number on the first pass only, and increment the sequence number. 417 if (this.pass == 0) { 418 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo)); 419 } 420 this.documentSeqNo += 1; 470 this.documentSeqNo ++; 421 471 422 472 return true; … … 436 486 this.mgPasses = new MGPassesWrapper(); 437 487 this.indexBuffer = new StringBuffer(); 438 int indexNo = (this.pass - 2) / 2; 439 MGIndex index = null; 440 if (this.pass >= 2) { 441 index = (MGIndex) this.indexes.get(indexNo); 442 if (index.hasError()) { 443 // an error has already occurred for this index, don't continue 444 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); 445 return false; 446 } 447 // attempt to ensure that the text subdirectory exists 448 this.indexDirectory = new File(outputDirectory, index.getName()); 449 if (!indexDirectory.exists()) { 450 if (!indexDirectory.mkdir()) { 451 return false; 452 } 453 } 454 else if (!indexDirectory.isDirectory()) { 455 return false; 456 } 457 458 this.currentIndexLevel = index.getLevel(); 459 this.currentIndexField = index.getField(); 460 this.currentIndexName = index.getName(); 461 462 if (this.currentIndexLevel == null || this.currentIndexField == null ) { 463 System.out.println("invalid index - level or field was null"); 464 return false; 465 } 466 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index 467 if (this.pass % 2 == 1) { 468 this.currentIndexName = null; // why??? 469 } 470 } 471 else { 472 473 this.currentIndexField = "text"; 474 this.currentIndexLevel = "section"; 475 this.currentIndexName = null; 476 } 477 478 // get the parameters for this execution of mg_passes 479 mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index"); 488 int indexNo = this.pass/2; 489 this.current_index = null; 490 491 this.current_index = (MGIndex) this.indexes.get(indexNo); 492 if (this.current_index.hasError()) { 493 // an error has already occurred for this index, don't continue 494 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); 495 return false; 496 } 497 // attempt to ensure that the text/index subdirectory exists 498 this.indexDirectory = new File(outputDirectory, current_index.getName()); 499 if (!indexDirectory.exists()) { 500 if (!indexDirectory.mkdir()) { 501 return false; 502 } 503 } 504 else if (!indexDirectory.isDirectory()) { 505 return false; 506 } 507 508 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index 509 if (this.pass == 0) { 510 // first pass, also set up the textStem 511 this.textDirectory = this.indexDirectory; 512 this.textStem = this.indexStem; 513 } 514 mgPasses.setFileName(this.indexStem); 480 515 if (!Misc.isWindows()) { 481 516 mgPasses.setBasePath("/"); … … 516 551 517 552 mgPasses.init(); 518 System.out.println(" Pass " + this.pass);553 System.out.println("Starting Pass " + this.pass); 519 554 return true; 520 555 } … … 524 559 */ 525 560 public boolean endPass(int passNumber) { 526 Process p;527 528 int indexNo = (passNumber - 2) / 2;529 MGIndex index = null;530 if (passNumber >= 2) {531 index = (MGIndex) this.indexes.get(indexNo);532 }533 561 try { 534 562 this.indexBuffer.append(END_OF_DOCUMENT); … … 549 577 if (exit_value !=0) { 550 578 //assume something has gone wrong, don't continue 551 if (index != null) { 552 index.setError(true); 553 return false; 554 } 579 current_index.setError(true); 580 return false; 555 581 } 556 582 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); … … 559 585 osextra = " -d / "; 560 586 } 561 587 562 588 switch (mgPass) { 563 589 564 590 case 0: 565 591 System.out.println("Compressing dictionary"); 566 exit_value = Processing.runProcess("mg_compression_dict -f " + this.text Directory.toString()+File.separator+"index"+ osextra + " -S -H -2 -k 5120");592 exit_value = Processing.runProcess("mg_compression_dict -f " + this.textStem + osextra + " -S -H -2 -k 5120"); 567 593 if (exit_value == 0) { 568 594 System.out.println("Compressed dictionary successfully written"); 569 595 } else { 570 596 System.err.println("Error from mg_compression_dict: " + exit_value); 571 index.setError(true);597 current_index.setError(true); 572 598 573 599 return false; … … 577 603 case 2: 578 604 System.out.println("Creating perfect hash"); 579 exit_value = Processing.runProcess("mg_perf_hash_build -f " + this.index Directory.toString()+File.separator+ "index"+osextra);605 exit_value = Processing.runProcess("mg_perf_hash_build -f " + this.indexStem+osextra); 580 606 if (exit_value ==0) { 581 607 System.out.println("Perfect hashes completed"); 582 608 } else { 583 609 System.err.println("Unable to build the perfect hash"); 584 index.setError(true);610 current_index.setError(true); 585 611 return false; 586 612 } … … 594 620 } else { 595 621 System.err.println("Unable to create weights file"); 596 index.setError(true);622 current_index.setError(true); 597 623 return false; 598 624 } 599 625 600 626 System.out.println("Creating inverted dictionary"); 601 exit_value = Processing.runProcess("mg_invf_dict -f " + this.index Directory.toString()+File.separator+"index"+ osextra);627 exit_value = Processing.runProcess("mg_invf_dict -f " + this.indexStem + osextra); 602 628 if (exit_value ==0) { 603 629 System.out.println("Inverted dictionary file successfully written"); 604 630 } else { 605 631 System.out.println("Unable to create inverted dictionary file"); 606 index.setError(true);632 current_index.setError(true); 607 633 return false; 608 634 } 609 635 610 636 System.out.println("Creating Stem indexes"); 611 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s1 -f " + this.index Directory.toString()+File.separator+"index"+osextra);637 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s1 -f " + this.indexStem+osextra); 612 638 if (exit_value == 0) { 613 639 System.out.println("Stemmed index 1 successfully written"); 614 640 } else { 615 641 System.out.println("Unable to create stemmed index 1"); 616 index.setError(true);617 return false; 618 } 619 620 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s2 -f " + this.index Directory.toString()+File.separator+"index"+osextra);642 current_index.setError(true); 643 return false; 644 } 645 646 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s2 -f " + this.indexStem+osextra); 621 647 if (exit_value == 0) { 622 648 System.out.println("Stemmed index 2 successfully written"); 623 649 } else { 624 650 System.out.println("Unable to create stemmed index 2"); 625 index.setError(true);626 return false; 627 } 628 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s3 -f " + this.index Directory.toString()+File.separator+"index"+osextra);651 current_index.setError(true); 652 return false; 653 } 654 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s3 -f " + this.indexStem+osextra); 629 655 if (exit_value == 0) { 630 656 System.out.println("Stemmed index 3 successfully written"); 631 657 } else { 632 658 System.out.println("Unable to create stemmed index 3"); 633 index.setError(true);659 current_index.setError(true); 634 660 return false; 635 661 } … … 654 680 public int getNumberOfPasses() 655 681 { 656 return 2 + this.indexes.size() * 2; 682 //return 2 + this.indexes.size() * 2; 683 return this.indexes.size()*2; 657 684 } 658 685 … … 665 692 boolean found_index = false; 666 693 String def_index = ""; // the default index will just be the first one created for now. 667 for (int i=0; i<this.indexes.size(); i++) { 694 for (int i=1; i<this.indexes.size(); i++) { 695 // start at 1: 0 will be the text index 668 696 MGIndex index = (MGIndex)this.indexes.get(i); 669 697 if (!index.hasError()) { … … 688 716 base_index_name.setAttribute(GSXML.NAME_ATT, overallName); 689 717 Element index_stem = doc.createElement("indexStem"); 690 index_stem.setAttribute(GSXML.NAME_ATT, "index");718 index_stem.setAttribute(GSXML.NAME_ATT, INDEX_FILE_STEM); 691 719 692 720 Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
Note:
See TracChangeset
for help on using the changeset viewer.