Changeset 8440 for trunk/gsdl3/src
- Timestamp: 2004-11-03T15:14:05+13:00 (20 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java
r8408 r8440 23 23 import org.greenstone.gsdl3.util.GSXML; 24 24 import org.greenstone.gsdl3.util.Misc; 25 import org.greenstone.gsdl3.util.Processing; 25 26 26 27 public class MGIndexer extends AbstractIndexer … … 342 343 343 344 if (!this.firstDocument) { 344 // Send a 'CTRL-B' before the document itself345 // try {346 //this.indexerTextfeed.write(END_OF_DOCUMENT);347 345 this.indexBuffer.append(END_OF_DOCUMENT); 348 346 mgPasses.processDocument(indexBuffer.toString()); … … 350 348 351 349 } 352 // }353 // catch (IOException ex)354 // { System.out.println("Bad output on end of document" + ex);355 // ex.printStackTrace();356 // return false;357 // }358 359 350 360 351 String docText = null; … … 374 365 } 375 366 if (docText == null) { 376 System.err.println("do c text is null");367 System.err.println("dom doc or sections was null - asking for doc text"); 377 368 if (this.currentIndexField.equals("text")) { 378 docText = Character.toString(END_OF_DOCUMENT) /*+ Character.toString(END_OF_SECTION)*/ + document.getDocumentText(); 379 System.err.println("prepending EOD to doctext"); 380 369 //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText(); 370 docText = document.getDocumentText(); 381 371 } 382 372 else { 383 373 StringBuffer textBuffer = new StringBuffer(); 384 textBuffer.append(END_OF_DOCUMENT); 385 System.err.println("* appending EOD to text"); 386 387 //textBuffer.append(END_OF_SECTION); 374 //textBuffer.append(END_OF_DOCUMENT); 388 375 List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField); 389 376 if (values != null) { … … 409 396 410 397 this.indexBuffer.append(docText); 411 //byte [] bytes = docText.getBytes();412 //int pos = 0, end = bytes.length;413 414 /*415 try {416 while (pos < end) {417 //this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));418 this.indexBuffer.append((char [])bytes, pos, (end - pos > 512 ? 
512 : end - pos));419 pos = pos + 512;420 421 try {422 while (this.indexerFeedback.available() > 0)423 { byte b[] = new byte[this.indexerFeedback.available()];424 System.out.println("Feedback of " + this.indexerFeedback.available());425 this.indexerFeedback.read(b);426 System.out.println(b);427 }428 }429 catch (IOException ex)430 { System.out.println(ex);431 }432 433 434 try {435 while (this.indexerErrors.available() > 0)436 { byte b[] = new byte[this.indexerErrors.available()];437 System.out.println("Feedback of " + this.indexerErrors.available());438 this.indexerErrors.read(b);439 System.out.println(new String(b));440 }441 }442 catch (IOException ex)443 { System.out.println(ex);444 }445 }446 }447 catch (IOException ex)448 { System.out.println("Bad output during document write " + ex + " " + pos + " " + end);449 ex.printStackTrace();450 return false;451 }452 */453 398 // remember that we're not on the first document, 454 399 this.firstDocument = false; 455 400 // assign the sequence number on the first pass only, and increment the sequence number. 
456 401 if (this.pass == 0) { 457 //document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));458 402 document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(startSeqNo)); 459 //System.out.println("Assigning " + startSeqNo + " to " + document.getID());460 403 } 461 404 this.documentSeqNo += 1; 462 405 463 // try {464 // while (this.indexerErrors.available() > 0)465 // { char c = (char) this.indexerErrors.read();466 // System.out.println(c);467 // }468 // while (this.indexerFeedback.available() > 0)469 // { byte b[] = new byte[this.indexerFeedback.available()];470 // System.out.println("Feedback of " + this.indexerFeedback.available());471 // this.indexerFeedback.read(b);472 // }473 // }474 // catch (IOException ex)475 // {476 // }477 406 return true; 478 407 } … … 481 410 * Initialise the pass: open required files, check status 482 411 */ 483 public boolean startPass(int passNumber) 484 { 485 486 this.pass = passNumber; 487 this.firstDocument = true; 488 this.documentSeqNo = 1; 489 this.sectionSeqNo = 1; 490 491 this.mgPasses = new MGPassesWrapper(); 492 this.indexBuffer = new StringBuffer(); 493 int indexNo = (this.pass - 2) / 2; 494 MGIndex index = null; 495 if (this.pass >= 2) { 496 index = (MGIndex) this.indexes.get(indexNo); 497 if (index.hasError()) { 498 // an error has already occurred for this index, don't continue 499 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); 500 return false; 501 } 502 // attempt to ensure that the text subdirectory exists 503 //this.indexDirectory = new File(outputDirectory, this.getIndexDirectory(index.getLevel(), index.getField())); 504 this.indexDirectory = new File(outputDirectory, index.getName()); 505 if (!indexDirectory.exists()) { 506 if (!indexDirectory.mkdir()) { 507 return false; 508 } 509 } 510 else if (!indexDirectory.isDirectory()) { 511 return false; 512 } 513 514 this.currentIndexLevel = index.getLevel(); 515 
this.currentIndexField = index.getField(); 516 this.currentIndexName = index.getName(); 517 518 if (this.currentIndexLevel == null || this.currentIndexField == null ) { 519 System.out.println("invalid index - level or field was null"); 520 return false; 521 } 522 //if (this.currentIndexName == null || this.currentIndexName.length() == 0) { 523 // this.currentIndexName = getIndexDirectory(index.getLevel(), index.getField()); 524 // } 525 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index 526 if (this.pass % 2 == 1) { 527 this.currentIndexName = null; // why??? 528 } 529 } 530 else { 531 532 this.currentIndexField = "text"; 533 this.currentIndexLevel = "section"; 534 this.currentIndexName = null; 535 } 536 537 // get the parameters for this execution of mg_passes 538 //String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString()); 539 mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index"); 540 if (!Misc.isWindows()) { 541 mgPasses.setBasePath("/"); 542 } 543 int mgPass = this.pass < 2 ? 
this.pass : ((this.pass % 2) + 2); 544 545 mgPasses.setBufferSize(100000); 546 // try { 547 // TODO add the other options to mg passes 412 public boolean startPass(int passNumber) { 413 414 415 this.pass = passNumber; 416 this.firstDocument = true; 417 this.documentSeqNo = 1; 418 this.sectionSeqNo = 1; 419 420 this.mgPasses = new MGPassesWrapper(); 421 this.indexBuffer = new StringBuffer(); 422 int indexNo = (this.pass - 2) / 2; 423 MGIndex index = null; 424 if (this.pass >= 2) { 425 index = (MGIndex) this.indexes.get(indexNo); 426 if (index.hasError()) { 427 // an error has already occurred for this index, don't continue 428 System.out.println("pass "+this.pass+": aborted due to errors in the previous pass"); 429 return false; 430 } 431 // attempt to ensure that the text subdirectory exists 432 this.indexDirectory = new File(outputDirectory, index.getName()); 433 if (!indexDirectory.exists()) { 434 if (!indexDirectory.mkdir()) { 435 return false; 436 } 437 } 438 else if (!indexDirectory.isDirectory()) { 439 return false; 440 } 441 442 this.currentIndexLevel = index.getLevel(); 443 this.currentIndexField = index.getField(); 444 this.currentIndexName = index.getName(); 445 446 if (this.currentIndexLevel == null || this.currentIndexField == null ) { 447 System.out.println("invalid index - level or field was null"); 448 return false; 449 } 450 this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM; // TODO: modify for index 451 if (this.pass % 2 == 1) { 452 this.currentIndexName = null; // why??? 453 } 454 } 455 else { 456 457 this.currentIndexField = "text"; 458 this.currentIndexLevel = "section"; 459 this.currentIndexName = null; 460 } 461 462 // get the parameters for this execution of mg_passes 463 mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index"); 464 if (!Misc.isWindows()) { 465 mgPasses.setBasePath("/"); 466 } 467 int mgPass = this.pass < 2 ? 
this.pass : ((this.pass % 2) + 2); 468 469 mgPasses.setBufferSize(100000); 470 548 471 switch (mgPass) { 549 472 case 0: 550 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");473 // -b 100000 -T1 551 474 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_1); 552 475 553 476 554 477 break; 555 556 557 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");558 478 479 case 1: 480 // -b 100000 -T2 481 mgPasses.addPass(MGPassesWrapper.TEXT_PASS_2); 559 482 break; 560 483 561 562 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");484 case 2: 485 // -b 100000 -2 -m 32 -s 0 -G -t 10 -N1 563 486 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_1); 564 487 mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2); … … 568 491 break; 569 492 570 571 //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");493 case 3: 494 // -b 100000 -2 -c 3 -G -t 10 -N2 572 495 mgPasses.addPass(MGPassesWrapper.INDEX_PASS_2); 573 496 mgPasses.setInvfLevel(MGPassesWrapper.INVF_LEVEL_2); … … 575 498 break; 576 499 } 577 500 578 501 mgPasses.init(); 579 // this.indexerFeedback = mg_passes.getInputStream(); 580 //this.indexerErrors = mg_passes.getErrorStream(); 581 //this.indexerTextfeed = mg_passes.getOutputStream(); 582 // } 583 // catch (IOException ex) 584 // { System.out.println(ex); 585 // ex.printStackTrace(); 586 // index.setError(true); 587 // return false; 588 // } 589 // catch (InterruptedException ex) 590 // { System.out.println(ex); 591 // ex.printStackTrace(); 592 // index.setError(true); 593 // return false; 594 // } 595 System.out.println("Pass " + this.pass); 596 return true; 502 System.out.println("Pass " + this.pass); 503 return true; 597 504 } 598 505 599 public void printProcessOutput(Process p)600 throws IOException {601 BufferedReader error_stream = new BufferedReader(new InputStreamReader( p.getErrorStream(), "UTF-8" ));602 
BufferedReader output_stream = new BufferedReader(new InputStreamReader( p.getInputStream(), "UTF-8" ));603 while (output_stream.ready()) {604 System.err.println("out> "+output_stream.readLine());605 }606 while (error_stream.ready()) {607 System.err.println("err> "+error_stream.readLine());608 }609 610 }611 506 /** 612 507 * Complete a pass - reset file counters, close files, etc. 613 508 */ 614 public boolean endPass(int passNumber) 615 {Process p;509 public boolean endPass(int passNumber) { 510 Process p; 616 511 617 512 int indexNo = (passNumber - 2) / 2; 618 513 MGIndex index = null; 619 if (passNumber >= 2) { 620 index = (MGIndex) this.indexes.get(indexNo); 621 } 622 try { 623 //this.indexerTextfeed.write(END_OF_DOCUMENT); 624 //this.indexerTextfeed.write(END_OF_STREAM); 625 this.indexBuffer.append(END_OF_DOCUMENT); 626 mgPasses.processDocument(indexBuffer.toString()); 627 this.indexBuffer.delete(0, this.indexBuffer.length()); 628 // while (this.indexerErrors.available() > 0) 629 // { char c = (char) this.indexerErrors.read(); 630 // System.out.print(c); 631 // } 632 // while (this.indexerFeedback.available() > 0) 633 // { byte b[] = new byte[this.indexerFeedback.available()]; 634 // System.out.print("Feedback of " + this.indexerFeedback.available()); 635 // this.indexerFeedback.read(b); 636 // } 637 638 //this.indexerTextfeed.close(); 639 Thread.sleep(1000); 640 //this.mg_passes.waitFor(); 641 } 642 // catch (IOException ex) 643 // { System.out.println(ex); 644 // } 645 catch (InterruptedException ex) 646 { System.out.println(ex); 647 } 648 // int exitValue = this.mg_passes.exitValue(); 649 mgPasses.finish(); 650 try { 651 Thread.sleep(1000); 652 } catch (Exception e) {} 653 int exitValue = 0; 654 System.out.println("Pass " + this.pass + " completed with " + exitValue); 655 if (exitValue !=0) { 656 //assume something has gone wrong, don't continue 657 if (index != null) { 658 index.setError(true); 659 return false; 660 } 661 } 662 int mgPass = this.pass < 2 ? 
this.pass : ((this.pass % 2) + 2); 663 String osextra = ""; 664 if (!Misc.isWindows()) { 665 osextra = " -d / "; 666 } 667 try { 668 switch (mgPass) 669 { 670 case 0: 514 if (passNumber >= 2) { 515 index = (MGIndex) this.indexes.get(indexNo); 516 } 517 try { 518 this.indexBuffer.append(END_OF_DOCUMENT); 519 mgPasses.processDocument(indexBuffer.toString()); 520 this.indexBuffer.delete(0, this.indexBuffer.length()); 521 Thread.sleep(1000); // what for?? 522 } 523 catch (InterruptedException ex) { 524 System.out.println(ex); 525 } 526 mgPasses.finish(); 527 try { 528 Thread.sleep(1000); 529 } catch (Exception e) {} 530 531 int exit_value = 0; 532 System.out.println("Pass " + this.pass + " completed with " + exit_value); 533 if (exit_value !=0) { 534 //assume something has gone wrong, don't continue 535 if (index != null) { 536 index.setError(true); 537 return false; 538 } 539 } 540 int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2); 541 String osextra = ""; 542 if (!Misc.isWindows()) { 543 osextra = " -d / "; 544 } 545 546 switch (mgPass) { 547 548 case 0: 671 549 System.out.println("Compressing dictionary"); 672 p = Runtime.getRuntime().exec("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120"); 673 p.waitFor(); 674 printProcessOutput(p); 675 if (p.exitValue() != 0) { 676 System.out.println("Error from mg_compression_dict: " + p.exitValue()); 677 index.setError(true); 678 679 return false; 680 } 681 else { 682 System.out.println("Compressed dictionary successfully written"); 683 } 684 break; 685 550 exit_value = Processing.runProcess("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120"); 551 if (exit_value == 0) { 552 System.out.println("Compressed dictionary successfully written"); 553 } else { 554 System.err.println("Error from mg_compression_dict: " + exit_value); 555 index.setError(true); 556 557 return false; 558 } 559 break; 560 
686 561 case 2: 687 562 System.out.println("Creating perfect hash"); 688 p = Runtime.getRuntime().exec("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra); 689 p.waitFor(); 690 printProcessOutput(p); 691 if (p.exitValue() == 0) { 692 System.out.println("Perfect hashes completed"); 563 exit_value = Processing.runProcess("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra); 564 if (exit_value ==0) { 565 System.out.println("Perfect hashes completed"); 693 566 } else { 694 System. out.println("Unable to build the perfect hash");567 System.err.println("Unable to build the perfect hash"); 695 568 index.setError(true); 696 569 return false; 697 570 } 698 571 break; 699 700 572 573 case 3: 701 574 System.out.println("Writing weights file"); 702 p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra); 703 p.waitFor(); 704 printProcessOutput(p); 705 if (p.exitValue() == 0) { 706 System.out.println("Weights file successfully written"); 707 } 708 else { 709 System.out.println("Unable to create weights file"); 710 index.setError(true); 711 return false; 712 713 } 714 715 p = Runtime.getRuntime().exec("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra); 716 p.waitFor(); 717 printProcessOutput(p); 718 if (p.exitValue() == 0) { 719 System.out.println("Inverted dictionary file successfully written"); 720 } 721 else { 722 System.out.println("Unable to create inverted dictionary file"); 723 index.setError(true); 724 return false; 725 726 } 727 728 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 729 p.waitFor(); 730 printProcessOutput(p); 731 if (p.exitValue() == 0) { 732 System.out.println("Stemmed index 1 successfully written"); 733 } 734 else { 735 System.out.println("Unable to create stemmed index 1"); 736 index.setError(true); 737 return false; 
738 739 } 740 741 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 742 p.waitFor(); 743 printProcessOutput(p); 744 if (p.exitValue() == 0) { 745 System.out.println("Stemmed index 2 successfully written"); 746 } 747 else { 748 System.out.println("Unable to create stemmed index 2"); 749 index.setError(true); 750 return false; 751 } 752 753 p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 754 p.waitFor(); 755 printProcessOutput(p); 756 if (p.exitValue() == 0) { 757 System.out.println("Stemmed index 3 successfully written"); 758 } 759 else { 760 System.out.println("Unable to create stemmed index 3"); 761 index.setError(true); 762 return false; 763 } 764 break; 765 } 766 } 767 catch (IOException ex) 768 { System.out.println(ex); 769 ex.printStackTrace(); 770 index.setError(true); 771 return false; 772 } 773 catch (InterruptedException ex) 774 { System.out.println(ex); 775 ex.printStackTrace(); 776 index.setError(true); 777 return false; 778 } 779 mgPasses = null; 780 return true; 781 } 782 575 exit_value = Processing.runProcess("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra); 576 if (exit_value ==0) { 577 System.out.println("Weights file successfully written"); 578 } else { 579 System.err.println("Unable to create weights file"); 580 index.setError(true); 581 return false; 582 } 583 584 System.out.println("Creating inverted dictionary"); 585 exit_value = Processing.runProcess("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra); 586 if (exit_value ==0) { 587 System.out.println("Inverted dictionary file successfully written"); 588 } else { 589 System.out.println("Unable to create inverted dictionary file"); 590 index.setError(true); 591 return false; 592 } 593 594 System.out.println("Creating Stem indexes"); 595 exit_value = Processing.runProcess("mg_stem_idx 
-b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 596 if (exit_value == 0) { 597 System.out.println("Stemmed index 1 successfully written"); 598 } else { 599 System.out.println("Unable to create stemmed index 1"); 600 index.setError(true); 601 return false; 602 } 603 604 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 605 if (exit_value == 0) { 606 System.out.println("Stemmed index 2 successfully written"); 607 } else { 608 System.out.println("Unable to create stemmed index 2"); 609 index.setError(true); 610 return false; 611 } 612 exit_value = Processing.runProcess("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra); 613 if (exit_value == 0) { 614 System.out.println("Stemmed index 3 successfully written"); 615 } else { 616 System.out.println("Unable to create stemmed index 3"); 617 index.setError(true); 618 return false; 619 } 620 621 break; 622 } // switch 623 624 mgPasses = null; 625 return true; 626 } 627 783 628 /** 784 629 * Do any tidying up … … 796 641 797 642 public boolean addServiceDescriptions(org.w3c.dom.Element service_rack_list) { 798 System.out.println("adding service description, MGIndexer");799 643 Document doc = service_rack_list.getOwnerDocument(); 800 644
Note: See TracChangeset for help on using the changeset viewer.