Changeset 7449
- Timestamp:
- 2004-05-26T16:03:44+12:00 (20 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r7311 r7449 9 9 import java.io.OutputStream; 10 10 import java.io.IOException; 11 import java.io.BufferedReader; 12 import java.io.InputStreamReader; 11 13 12 14 import org.w3c.dom.*; 15 16 import org.greenstone.mg.*; 13 17 14 18 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID; … … 18 22 import org.greenstone.gsdl3.gs3build.xpointer.XPointer; 19 23 import org.greenstone.gsdl3.util.GSXML; 24 import org.greenstone.gsdl3.util.Misc; 20 25 21 26 public class MGIndexer extends AbstractIndexer … … 26 31 boolean firstDocument; 27 32 String outputDirectory; 28 InputStream indexerFeedback; 29 InputStream indexerErrors; 30 OutputStream indexerTextfeed; 31 Process mg_passes; 33 // InputStream indexerFeedback; 34 // InputStream indexerErrors; 35 //OutputStream indexerTextfeed; 36 StringBuffer indexBuffer; 37 //Process mg_passes; 32 38 File textDirectory; 33 39 File indexDirectory; … … 41 47 String currentIndexField; 42 48 43 49 MGPassesWrapper mgPasses; 50 44 51 static final char END_OF_DOCUMENT = (char) 2; 45 static final char END_OF_SECTION = (char) 3;52 static final char END_OF_SECTION = (char) 3; // actually this is end of para for mg 46 53 static final char END_OF_STREAM = (char) 4; 47 54 48 public static final String MG_INDEX_TYPE = "mg";55 public static final String MG_INDEX_TYPE = "mg"; 49 56 public static final String INDEX_FILE_STEM = "index"; 57 50 58 class MGIndex 51 59 { String name=null; 52 60 String level=null; 53 61 String field=null; 54 boolean error = false; 62 boolean error = false;// assume built until we get an error 55 63 56 64 public MGIndex(String name, String level, String field) … … 58 66 this.level = level; 59 67 this.field = field; 60 //this.error = false; // assume built until we get an error61 68 } 62 69 … … 69 76 createIndexName(); 70 77 } 71 //this.name = null; 72 //this.error = false; 73 } 78 } 74 79 75 80 public String getLevel() … … 124 129 { this.indexes = new ArrayList(); 125 130 this.overallName = name; 131 126 132 } 127 133 … … 228 234 229 235 // append an 'end of section' marker 230 textBuffer.append(END_OF_SECTION);236 //textBuffer.append(END_OF_SECTION); 231 237 this.sectionSeqNo ++; 232 238 … … 264 270 textBuffer.append(value); 265 271 if (valueIter.hasNext()) { 266 textBuffer.append(END_OF_SECTION);272 //textBuffer.append(END_OF_SECTION); 267 273 } 268 274 } … … 305 311 node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null)); 306 312 } 307 /*308 textBuffer.append(END_OF_SECTION);313 314 //textBuffer.append(END_OF_SECTION); 309 315 this.sectionSeqNo ++; 310 */316 311 317 } 312 318 return node; … … 329 335 * the body text of the document. 330 336 */ 331 public boolean indexDocument(DocumentID docID, DocumentInterface document) 332 { 333 if (this.pass == 0) { 334 document.removeAllMetadata("gsdl3", "mgseqno"); 335 } 336 337 if (!this.firstDocument) 338 { // Send a 'CTRL-B' before the document itself 339 try { 340 this.indexerTextfeed.write(END_OF_DOCUMENT); 341 } 342 catch (IOException ex) 343 { System.out.println("Bad output on end of document" + ex); 344 ex.printStackTrace(); 345 return false; 346 } 347 } 348 349 String docText = null; 350 351 int startSeqNo = this.sectionSeqNo; 352 this.sectionSeqNo ++; 353 354 Document domDocument = document.getDOMDocument(); 355 if (domDocument != null) { 356 METSStructure sections = document.getDocumentStructure().getStructure("Section"); 357 if (sections != null) { 358 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); 359 // System.out.println(docText); 360 } 361 } 362 if (docText == null) { 363 if (this.currentIndexField.equals("text")) { 364 docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) + 365 document.getDocumentText(); 366 } 367 else { 368 StringBuffer textBuffer = new StringBuffer(); 369 textBuffer.append(END_OF_DOCUMENT); 370 textBuffer.append(END_OF_SECTION); 371 List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); 372 if (values != null) { 373 Iterator valueIter = values.iterator(); 374 while (valueIter.hasNext()) { 375 String value = valueIter.next().toString(); 337 public boolean indexDocument(DocumentID docID, DocumentInterface document) 338 { 339 if (this.pass == 0) { 340 document.removeAllMetadata("gsdl3", "mgseqno"); 341 } 342 343 if (!this.firstDocument) { 344 // Send a 'CTRL-B' before the document itself 345 // try { 346 //this.indexerTextfeed.write(END_OF_DOCUMENT); 347 this.indexBuffer.append(END_OF_DOCUMENT); 348 mgPasses.processDocument(indexBuffer.toString()); 349 this.indexBuffer.delete(0, this.indexBuffer.length()); 376 350 377 textBuffer.append(value); 378 if (valueIter.hasNext()) { 379 textBuffer.append(END_OF_SECTION); 380 // sectionSeqNo ++; 381 } 382 } 383 } 384 else { 385 textBuffer.append("No data"); 386 } 387 docText = textBuffer.toString(); 388 } 389 sectionSeqNo ++; 390 } 391 392 /* if (this.pass == 0) { 393 System.err.println(docText); 394 } 395 */ 396 397 byte [] bytes = docText.getBytes(); 398 int pos = 0, end = bytes.length; 399 400 try { 401 while (pos < end) { 402 this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos)); 351 } 352 // } 353 // catch (IOException ex) 354 // { System.out.println("Bad output on end of document" + ex); 355 // ex.printStackTrace(); 356 // return false; 357 // } 358 359 360 String docText = null; 361 362 int startSeqNo = this.sectionSeqNo; 363 this.sectionSeqNo ++; 364 365 Document domDocument = document.getDOMDocument(); 366 if (domDocument != null) { 367 System.err.println("dom doc is not null"); 368 METSStructure sections = document.getDocumentStructure().getStructure("Section"); 369 if (sections != null) { 370 System.err.println("sections are not null"); 371 docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field); 372 // System.out.println(docText); 373 } 374 } 375 if (docText == null) { 376 System.err.println("doc text is null"); 377 if (this.currentIndexField.equals("text")) { 378 docText = Character.toString(END_OF_DOCUMENT) /*+ Character.toString(END_OF_SECTION)*/ + document.getDocumentText(); 379 System.err.println("prepending EOD to doctext"); 380 381 } 382 else { 383 StringBuffer textBuffer = new StringBuffer(); 384 textBuffer.append(END_OF_DOCUMENT); 385 System.err.println("* appending EOD to text"); 386 387 //textBuffer.append(END_OF_SECTION); 388 List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); 389 if (values != null) { 390 Iterator valueIter = values.iterator(); 391 while (valueIter.hasNext()) { 392 String value = valueIter.next().toString(); 393 394 textBuffer.append(value); 395 if (valueIter.hasNext()) { 396 //textBuffer.append(END_OF_SECTION); 397 // sectionSeqNo ++; 398 } 399 } 400 } 401 else { 402 textBuffer.append("No data"); 403 } 404 docText = textBuffer.toString(); 405 } 406 sectionSeqNo ++; 407 } 408 409 410 this.indexBuffer.append(docText); 411 //byte [] bytes = docText.getBytes(); 412 //int pos = 0, end = bytes.length; 413 414 /* 415 try { 416 while (pos < end) { 417 //this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos)); 418 this.indexBuffer.append((char [])bytes, pos, (end - pos > 512 ? 512 : end - pos)); 403 419 pos = pos + 512; 404 420 … … 434 450 return false; 435 451 } 436 452 */ 437 453 // remember that we're not on the first document, 438 454 this.firstDocument = false; … … 445 461 this.documentSeqNo += 1; 446 462 447 try {448 while (this.indexerErrors.available() > 0)449 { char c = (char) this.indexerErrors.read();450 System.out.println(c);451 }452 while (this.indexerFeedback.available() > 0)453 { byte b[] = new byte[this.indexerFeedback.available()];454 System.out.println("Feedback of " + this.indexerFeedback.available());455 this.indexerFeedback.read(b);456 }457 }458 catch (IOException ex)459 {460 }463 // try { 464 // while (this.indexerErrors.available() > 0) 465 // { char c = (char) this.indexerErrors.read(); 466 // System.out.println(c); 467 // } 468 // while (this.indexerFeedback.available() > 0) 469 // { byte b[] = new byte[this.indexerFeedback.available()]; 470 // System.out.println("Feedback of " + this.indexerFeedback.available()); 471 // this.indexerFeedback.read(b); 472 // } 473 // } 474 // catch (IOException ex) 475 // { 476 // } 461 477 return true; 462 478 } … … 467 483 public boolean startPass(int passNumber) 468 484 { 485 469 486 this.pass = passNumber; 470 487 this.firstDocument = true; … … 472 489 this.sectionSeqNo = 1; 473 490 491 this.mgPasses = new MGPassesWrapper(); 492 this.indexBuffer = new StringBuffer(); 474 493 int indexNo = (this.pass - 2) / 2; 475 494 MGIndex index = null; … … 517 536 518 537 // get the parameters for this execution of mg_passes 519 String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString()); 520 538 //String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString()); 539 mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index"); 540 if (!Misc.isWindows()) { 541 mgPasses.setBasePath("/"); 542 } 521 543 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); 522 544 523 try { 545 // try { 546 // TODO add the other options to mg passes 524 547 switch (mgPass) { 525 case 0: 526 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1"); 548 case 0: 549 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1"); 550 mgPasses.addPass('T', '1'); 551 552 527 553 break; 528 554 529 555 case 1: 530 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2"); 556 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2"); 557 mgPasses.addPass('T', '2'); 531 558 break; 532 559 533 560 case 2: 534 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1"); 561 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1"); 562 mgPasses.addPass('N', '1'); 563 mgPasses.setInvfLevel('2'); 564 mgPasses.setStemOptions("0",0); 535 565 break; 536 566 537 567 case 3: 538 mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2"); 568 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2"); 569 mgPasses.addPass('N', '2'); 570 mgPasses.setInvfLevel('2'); 539 571 break; 540 572 } 541 573 542 this.indexerFeedback = mg_passes.getInputStream(); 543 this.indexerErrors = mg_passes.getErrorStream(); 544 this.indexerTextfeed = mg_passes.getOutputStream(); 545 } 546 catch (IOException ex) 547 { System.out.println(ex); 548 ex.printStackTrace(); 549 index.setError(true); 550 return false; 551 } 574 mgPasses.init(); 575 // this.indexerFeedback = mg_passes.getInputStream(); 576 //this.indexerErrors = mg_passes.getErrorStream(); 577 //this.indexerTextfeed = mg_passes.getOutputStream(); 578 // } 579 // catch (IOException ex) 580 // { System.out.println(ex); 581 // ex.printStackTrace(); 582 // index.setError(true); 583 // return false; 584 // } 552 585 // catch (InterruptedException ex) 553 586 // { System.out.println(ex); … … 560 593 } 561 594 562 595 public void printProcessOutput(Process p) 596 throws IOException { 597 BufferedReader error_stream = new BufferedReader(new InputStreamReader( p.getErrorStream(), "UTF-8" )); 598 BufferedReader output_stream = new BufferedReader(new InputStreamReader( p.getInputStream(), "UTF-8" )); 599 while (output_stream.ready()) { 600 System.err.println("out> "+output_stream.readLine()); 601 } 602 while (error_stream.ready()) { 603 System.err.println("err> "+error_stream.readLine()); 604 } 605 606 } 563 607 /** 564 608 * Complete a pass - reset file counters, close files, etc. … … 573 617 } 574 618 try { 575 this.indexerTextfeed.write(END_OF_DOCUMENT); 576 this.indexerTextfeed.write(END_OF_STREAM); 577 while (this.indexerErrors.available() > 0) 578 { char c = (char) this.indexerErrors.read(); 579 System.out.print(c); 580 } 581 while (this.indexerFeedback.available() > 0) 582 { byte b[] = new byte[this.indexerFeedback.available()]; 583 System.out.print("Feedback of " + this.indexerFeedback.available()); 584 this.indexerFeedback.read(b); 585 } 586 587 this.indexerTextfeed.close(); 619 //this.indexerTextfeed.write(END_OF_DOCUMENT); 620 //this.indexerTextfeed.write(END_OF_STREAM); 621 this.indexBuffer.append(END_OF_DOCUMENT); 622 mgPasses.processDocument(indexBuffer.toString()); 623 this.indexBuffer.delete(0, this.indexBuffer.length()); 624 // while (this.indexerErrors.available() > 0) 625 // { char c = (char) this.indexerErrors.read(); 626 // System.out.print(c); 627 // } 628 // while (this.indexerFeedback.available() > 0) 629 // { byte b[] = new byte[this.indexerFeedback.available()]; 630 // System.out.print("Feedback of " + this.indexerFeedback.available()); 631 // this.indexerFeedback.read(b); 632 // } 633 634 //this.indexerTextfeed.close(); 588 635 Thread.sleep(1000); 589 this.mg_passes.waitFor();590 } 591 catch (IOException ex)592 { System.out.println(ex);593 }636 //this.mg_passes.waitFor(); 637 } 638 // catch (IOException ex) 639 // { System.out.println(ex); 640 // } 594 641 catch (InterruptedException ex) 595 642 { System.out.println(ex); 596 643 } 597 int exitValue = this.mg_passes.exitValue(); 644 // int exitValue = this.mg_passes.exitValue(); 645 mgPasses.finish(); 646 try { 647 Thread.sleep(1000); 648 } catch (Exception e) {} 649 int exitValue = 0; 598 650 System.out.println("Pass " + this.pass + " completed with " + exitValue); 599 651 if (exitValue !=0) { … … 605 657 } 606 658 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); 659 String osextra = ""; 660 if (!Misc.isWindows()) { 661 osextra = " -d / "; 662 } 607 663 try { 608 664 switch (mgPass) … … 610 666 case 0: 611 667 System.out.println("Compressing dictionary"); 612 p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString()+ " -S -H -2 -k 5120");668 p = Runtime.getRuntime().exec("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120"); 613 669 p.waitFor(); 670 printProcessOutput(p); 614 671 if (p.exitValue() != 0) { 615 672 System.out.println("Error from mg_compression_dict: " + p.exitValue()); 673 index.setError(true); 616 674 617 675 return false; … … 624 682 case 2: 625 683 System.out.println("Creating perfect hash"); 626 p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());684 p = Runtime.getRuntime().exec("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra); 627 685 p.waitFor(); 686 printProcessOutput(p); 628 687 if (p.exitValue() == 0) { 629 688 System.out.println("Perfect hashes completed"); … … 637 696 case 3: 638 697 System.out.println("Writing weights file"); 639 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");698 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra); 640 699 p.waitFor(); 700 printProcessOutput(p); 641 701 if (p.exitValue() == 0) { 642 702 System.out.println("Weights file successfully written"); 643 703 } 644 704 else { 645 System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");705 System.out.println("Unable to create weights file"); 646 706 index.setError(true); 647 707 return false; … … 649 709 } 650 710 651 p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());711 p = Runtime.getRuntime().exec("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra); 652 712 p.waitFor(); 713 printProcessOutput(p); 653 714 if (p.exitValue() == 0) { 654 715 System.out.println("Inverted dictionary file successfully written"); … … 661 722 } 662 723 663 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());724 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 664 725 p.waitFor(); 726 printProcessOutput(p); 665 727 if (p.exitValue() == 0) { 666 728 System.out.println("Stemmed index 1 successfully written"); … … 673 735 } 674 736 675 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());737 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 676 738 p.waitFor(); 739 printProcessOutput(p); 677 740 if (p.exitValue() == 0) { 678 741 System.out.println("Stemmed index 2 successfully written"); … … 684 747 } 685 748 686 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());749 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 687 750 p.waitFor(); 751 printProcessOutput(p); 688 752 if (p.exitValue() == 0) { 689 753 System.out.println("Stemmed index 3 successfully written");
Note:
See TracChangeset
for help on using the changeset viewer.