Changeset 7449


Ignore:
Timestamp:
2004-05-26T16:03:44+12:00 (20 years ago)
Author:
kjdon
Message:

Now uses a JNI wrapper around mg_passes instead of piping via stdin/stdout. Also changed the path arguments to the auxiliary programs so that they all look for files in the same place on Windows.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java

    r7311 r7449  
    99import java.io.OutputStream;
    1010import java.io.IOException;
     11import java.io.BufferedReader;
     12import java.io.InputStreamReader;
    1113
    1214import org.w3c.dom.*;
     15
     16import org.greenstone.mg.*;
    1317
    1418import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
     
    1822import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
    1923import org.greenstone.gsdl3.util.GSXML;
     24import org.greenstone.gsdl3.util.Misc;
    2025
    2126public class MGIndexer extends AbstractIndexer
     
    2631  boolean      firstDocument;
    2732  String       outputDirectory;
    28   InputStream  indexerFeedback;
    29   InputStream  indexerErrors;
    30   OutputStream indexerTextfeed;
    31   Process      mg_passes;
     33//   InputStream  indexerFeedback;
     34//   InputStream  indexerErrors;
     35    //OutputStream indexerTextfeed;
     36    StringBuffer indexBuffer;
     37    //Process      mg_passes;
    3238  File         textDirectory;
    3339  File         indexDirectory;
     
    4147    String       currentIndexField;
    4248
    43 
     49    MGPassesWrapper mgPasses;
     50   
    4451  static final char END_OF_DOCUMENT = (char) 2;
    45   static final char END_OF_SECTION  = (char) 3;
     52    static final char END_OF_SECTION  = (char) 3; // actually this is end of para for mg
    4653  static final char END_OF_STREAM   = (char) 4;
    4754
    48   public static final String MG_INDEX_TYPE = "mg";
     55    public static final String MG_INDEX_TYPE = "mg";
    4956    public static final String INDEX_FILE_STEM = "index";
     57   
    5058  class MGIndex
    5159  { String name=null;
    5260    String level=null;
    5361    String field=null;
    54       boolean error = false;
     62      boolean error = false;// assume built until we get an error
    5563
    5664    public MGIndex(String name, String level, String field)
     
    5866      this.level = level;
    5967      this.field = field;
    60       //this.error = false; // assume built until we get an error
    6168    }
    6269
     
    6976    createIndexName();
    7077      }
    71       //this.name = null;
    72       //this.error = false;
    73     }
     78   }
    7479
    7580    public String getLevel()
     
    124129  { this.indexes = new ArrayList();
    125130    this.overallName = name;
     131   
    126132  }
    127133
     
    228234   
    229235    // append an 'end of section' marker
    230     textBuffer.append(END_OF_SECTION);
     236    //textBuffer.append(END_OF_SECTION);
    231237    this.sectionSeqNo ++;
    232238   
     
    264270          textBuffer.append(value);
    265271          if (valueIter.hasNext()) {
    266           textBuffer.append(END_OF_SECTION);
     272          //textBuffer.append(END_OF_SECTION);
    267273          }
    268274        }
     
    305311    node = XPointer.getNextNode(node, (this.currentIndexField.equals("text") ? textBuffer : null));
    306312      }
    307       /*
    308       textBuffer.append(END_OF_SECTION);
     313     
     314      //textBuffer.append(END_OF_SECTION);
    309315      this.sectionSeqNo ++;
    310       */
     316     
    311317    }
    312318    return node;
     
    329335   *  the body text of the document.
    330336   */
    331   public boolean indexDocument(DocumentID docID, DocumentInterface document)
    332   {
    333     if (this.pass == 0) {
    334       document.removeAllMetadata("gsdl3", "mgseqno");
    335     }
    336 
    337     if (!this.firstDocument)
    338     { // Send a 'CTRL-B' before the document itself
    339       try {
    340     this.indexerTextfeed.write(END_OF_DOCUMENT);
    341       }
    342       catch (IOException ex)
    343       { System.out.println("Bad output on end of document" + ex);
    344     ex.printStackTrace();
    345     return false;
    346       }
    347     }
    348 
    349     String docText = null;
    350 
    351     int startSeqNo = this.sectionSeqNo;
    352     this.sectionSeqNo ++;
    353 
    354     Document domDocument = document.getDOMDocument();
    355     if (domDocument != null) {
    356       METSStructure sections = document.getDocumentStructure().getStructure("Section");
    357       if (sections != null) {
    358     docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
    359     //  System.out.println(docText);
    360       }
    361     }
    362     if (docText == null) {
    363       if (this.currentIndexField.equals("text")) {
    364       docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
    365       document.getDocumentText();
    366       }
    367       else {
    368     StringBuffer textBuffer = new StringBuffer();
    369     textBuffer.append(END_OF_DOCUMENT);
    370     textBuffer.append(END_OF_SECTION);
    371     List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField);
    372     if (values != null) {
    373       Iterator valueIter = values.iterator();
    374       while (valueIter.hasNext()) {
    375         String value = valueIter.next().toString();
     337    public boolean indexDocument(DocumentID docID, DocumentInterface document)
     338    {
     339    if (this.pass == 0) {
     340        document.removeAllMetadata("gsdl3", "mgseqno");
     341    }
     342   
     343    if (!this.firstDocument) {
     344        // Send a 'CTRL-B' before the document itself
     345        // try {
     346        //this.indexerTextfeed.write(END_OF_DOCUMENT);
     347        this.indexBuffer.append(END_OF_DOCUMENT);
     348        mgPasses.processDocument(indexBuffer.toString());
     349        this.indexBuffer.delete(0, this.indexBuffer.length());
    376350       
    377         textBuffer.append(value);
    378         if (valueIter.hasNext()) {
    379         textBuffer.append(END_OF_SECTION);
    380           //          sectionSeqNo ++;
    381         }
    382       }
    383     }
    384     else {
    385       textBuffer.append("No data");
    386     }
    387     docText = textBuffer.toString();
    388       }
    389       sectionSeqNo ++;
    390     }
    391 
    392     /*    if (this.pass == 0) {
    393       System.err.println(docText);
    394     }
    395     */
    396 
    397     byte [] bytes = docText.getBytes();
    398     int pos = 0, end = bytes.length;
    399 
    400     try {
    401       while (pos < end) {
    402     this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
     351    }
     352        //   }
     353        //  catch (IOException ex)
     354        //  { System.out.println("Bad output on end of document" + ex);
     355      //    ex.printStackTrace();
     356      //    return false;
     357      //  }
     358   
     359
     360    String docText = null;
     361   
     362    int startSeqNo = this.sectionSeqNo;
     363    this.sectionSeqNo ++;
     364   
     365    Document domDocument = document.getDOMDocument();
     366    if (domDocument != null) {
     367        System.err.println("dom doc is not null");
     368        METSStructure sections = document.getDocumentStructure().getStructure("Section");
     369        if (sections != null) {
     370        System.err.println("sections are not null");
     371        docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
     372        //  System.out.println(docText);
     373        }
     374    }
     375    if (docText == null) {
     376        System.err.println("doc text is null");
     377        if (this.currentIndexField.equals("text")) {
     378        docText = Character.toString(END_OF_DOCUMENT) /*+ Character.toString(END_OF_SECTION)*/ + document.getDocumentText();
     379        System.err.println("prepending EOD to doctext");
     380   
     381        }
     382        else {
     383        StringBuffer textBuffer = new StringBuffer();
     384        textBuffer.append(END_OF_DOCUMENT);
     385        System.err.println("* appending EOD to text");
     386   
     387        //textBuffer.append(END_OF_SECTION);
     388        List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField);
     389        if (values != null) {
     390            Iterator valueIter = values.iterator();
     391            while (valueIter.hasNext()) {
     392            String value = valueIter.next().toString();
     393           
     394            textBuffer.append(value);
     395            if (valueIter.hasNext()) {
     396                //textBuffer.append(END_OF_SECTION);
     397                //        sectionSeqNo ++;
     398            }
     399            }
     400        }
     401        else {
     402            textBuffer.append("No data");
     403        }
     404        docText = textBuffer.toString();
     405        }
     406        sectionSeqNo ++;
     407    }
     408   
     409   
     410    this.indexBuffer.append(docText);
     411    //byte [] bytes = docText.getBytes();
     412    //int pos = 0, end = bytes.length;
     413   
     414    /*
     415      try {
     416      while (pos < end) {
     417      //this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
     418      this.indexBuffer.append((char [])bytes, pos, (end - pos > 512 ? 512 : end - pos));
    403419    pos = pos + 512;
    404420     
     
    434450      return false;
    435451    }
    436 
     452    */
    437453    // remember that we're not on the first document,
    438454    this.firstDocument = false;
     
    445461    this.documentSeqNo += 1;
    446462
    447     try {
    448       while (this.indexerErrors.available() > 0)
    449       { char c = (char) this.indexerErrors.read();
    450         System.out.println(c);
    451       }
    452       while (this.indexerFeedback.available() > 0)
    453       { byte b[] = new byte[this.indexerFeedback.available()];
    454         System.out.println("Feedback of " + this.indexerFeedback.available());
    455     this.indexerFeedback.read(b);
    456       }
    457     }
    458     catch (IOException ex)
    459     {
    460     }
     463    //    try {
     464//       while (this.indexerErrors.available() > 0)
     465//       { char c = (char) this.indexerErrors.read();
     466//         System.out.println(c);
     467//       }
     468//       while (this.indexerFeedback.available() > 0)
     469//       { byte b[] = new byte[this.indexerFeedback.available()];
     470//         System.out.println("Feedback of " + this.indexerFeedback.available());
     471// this.indexerFeedback.read(b);
     472//       }
     473//    }
     474//     catch (IOException ex)
     475//     {
     476//     }
    461477    return true;
    462478  }
     
    467483    public boolean startPass(int passNumber)
    468484    {   
     485   
    469486      this.pass = passNumber;
    470487      this.firstDocument = true;
     
    472489      this.sectionSeqNo  = 1;
    473490
     491      this.mgPasses = new MGPassesWrapper();
     492      this.indexBuffer = new StringBuffer();
    474493      int indexNo = (this.pass - 2) / 2;
    475494      MGIndex index = null;
     
    517536     
    518537      // get the parameters for this execution of mg_passes
    519       String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
    520 
     538      //String pathParams = "-f index -d " + (this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString());
     539      mgPasses.setFileName((this.pass < 2 ? this.textDirectory.toString() : this.indexDirectory.toString())+File.separator+ "index");
     540      if (!Misc.isWindows()) {
     541      mgPasses.setBasePath("/");
     542      }
    521543      int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
    522544     
    523       try {
     545      // try {
     546      // TODO add the other options to mg passes
    524547    switch (mgPass) {
    525       case 0:
    526         mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
     548    case 0:
     549          //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -T1");
     550        mgPasses.addPass('T', '1');
     551       
     552       
    527553        break;
    528554
    529555      case 1:
    530         mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
     556          //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -T2");
     557          mgPasses.addPass('T', '2');
    531558        break;
    532559       
    533560      case 2:
    534         mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
     561          //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams + " -b 100000 -2 -m 32 -s 0 -G -t 10 -N1");
     562        mgPasses.addPass('N', '1');
     563        mgPasses.setInvfLevel('2');
     564        mgPasses.setStemOptions("0",0);
    535565        break;
    536566       
    537567      case 3:
    538         mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
     568          //mg_passes = Runtime.getRuntime().exec("mg_passes " + pathParams +" -b 100000 -2 -c 3 -G -t 10 -N2");
     569        mgPasses.addPass('N', '2');
     570        mgPasses.setInvfLevel('2');
    539571        break;
    540572    }
    541573       
    542     this.indexerFeedback = mg_passes.getInputStream();
    543     this.indexerErrors   = mg_passes.getErrorStream();
    544     this.indexerTextfeed = mg_passes.getOutputStream();
    545       }
    546       catch (IOException ex)
    547       { System.out.println(ex);
    548         ex.printStackTrace();
    549     index.setError(true);
    550     return false;
    551       }
     574    mgPasses.init();
     575    //  this.indexerFeedback = mg_passes.getInputStream();
     576    //this.indexerErrors   = mg_passes.getErrorStream();
     577    //this.indexerTextfeed = mg_passes.getOutputStream();
     578    // }
     579    //      catch (IOException ex)
     580    // { System.out.println(ex);
     581    // ex.printStackTrace();
     582    //  index.setError(true);
     583    //  return false;
     584    // }   
    552585      //        catch (InterruptedException ex)
    553586//        { System.out.println(ex);
     
    560593    }
    561594 
    562 
     595    public void printProcessOutput(Process p)
     596    throws IOException {
     597    BufferedReader error_stream = new BufferedReader(new InputStreamReader( p.getErrorStream(), "UTF-8" ));
     598    BufferedReader output_stream = new BufferedReader(new InputStreamReader( p.getInputStream(), "UTF-8" ));
     599    while (output_stream.ready()) {
     600        System.err.println("out> "+output_stream.readLine());
     601    }
     602    while (error_stream.ready()) {
     603        System.err.println("err> "+error_stream.readLine());
     604    }
     605   
     606    }
    563607    /**
    564608     *  Complete a pass - reset file counters, close files, etc.
     
    573617    }
    574618    try {
    575     this.indexerTextfeed.write(END_OF_DOCUMENT);
    576     this.indexerTextfeed.write(END_OF_STREAM);
    577     while (this.indexerErrors.available() > 0)
    578     { char c = (char) this.indexerErrors.read();
    579       System.out.print(c);
    580     }
    581     while (this.indexerFeedback.available() > 0)
    582     { byte b[] = new byte[this.indexerFeedback.available()];
    583       System.out.print("Feedback of " + this.indexerFeedback.available());
    584       this.indexerFeedback.read(b);
    585     }
    586 
    587     this.indexerTextfeed.close();
     619    //this.indexerTextfeed.write(END_OF_DOCUMENT);
     620    //this.indexerTextfeed.write(END_OF_STREAM);
     621    this.indexBuffer.append(END_OF_DOCUMENT);
     622    mgPasses.processDocument(indexBuffer.toString());
     623    this.indexBuffer.delete(0, this.indexBuffer.length());
     624//  while (this.indexerErrors.available() > 0)
     625//  { char c = (char) this.indexerErrors.read();
     626//    System.out.print(c);
     627//  }
     628//  while (this.indexerFeedback.available() > 0)
     629//  { byte b[] = new byte[this.indexerFeedback.available()];
     630//    System.out.print("Feedback of " + this.indexerFeedback.available());
     631//    this.indexerFeedback.read(b);
     632//  }
     633
     634    //this.indexerTextfeed.close();
    588635    Thread.sleep(1000);
    589     this.mg_passes.waitFor();
    590     }
    591     catch (IOException ex)
    592       { System.out.println(ex);
    593       }
     636    //this.mg_passes.waitFor();
     637    }
     638//     catch (IOException ex)
     639//       { System.out.println(ex);
     640//       }
    594641      catch (InterruptedException ex)
    595642      { System.out.println(ex);
    596643      }
    597     int exitValue = this.mg_passes.exitValue();
     644    //    int exitValue = this.mg_passes.exitValue();
     645    mgPasses.finish();
     646    try {
     647    Thread.sleep(1000);
     648    } catch (Exception e) {}
     649    int exitValue = 0;
    598650    System.out.println("Pass " + this.pass + " completed with " + exitValue);
    599651      if (exitValue !=0) {
     
    605657      }
    606658      int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
     659      String osextra = "";
     660      if (!Misc.isWindows()) {
     661      osextra = " -d / ";
     662      }
    607663      try {
    608664    switch (mgPass)
     
    610666      case 0:
    611667        System.out.println("Compressing dictionary");
    612         p = Runtime.getRuntime().exec("mg_compression_dict -f index -d " + this.textDirectory.toString() + " -S -H -2 -k 5120");
     668        p = Runtime.getRuntime().exec("mg_compression_dict -f " + this.textDirectory.toString()+File.separator+"index" + osextra + " -S -H -2 -k 5120");
    613669        p.waitFor();
     670        printProcessOutput(p);
    614671        if (p.exitValue() != 0) {
    615672          System.out.println("Error from mg_compression_dict: " + p.exitValue());
     673          index.setError(true);
    616674         
    617675          return false;
     
    624682    case 2:
    625683        System.out.println("Creating perfect hash");
    626         p = Runtime.getRuntime().exec("mg_perf_hash_build -f index -d " + this.indexDirectory.toString());
     684        p = Runtime.getRuntime().exec("mg_perf_hash_build -f " + this.indexDirectory.toString()+File.separator+ "index"+osextra);
    627685        p.waitFor();
     686        printProcessOutput(p);
    628687        if (p.exitValue() == 0) {
    629688          System.out.println("Perfect hashes completed");
     
    637696      case 3:
    638697        System.out.println("Writing weights file");
    639         p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
     698        p = Runtime.getRuntime().exec("mg_weights_build -f " + this.indexStem + " -t " + this.textStem + osextra);
    640699        p.waitFor();
     700        printProcessOutput(p);
    641701        if (p.exitValue() == 0) {
    642702          System.out.println("Weights file successfully written");
    643703        }
    644704        else {
    645           System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
     705          System.out.println("Unable to create weights file");
    646706          index.setError(true);
    647707          return false;
     
    649709        }
    650710
    651         p = Runtime.getRuntime().exec("mg_invf_dict -f index -d " + this.indexDirectory.toString());
     711        p = Runtime.getRuntime().exec("mg_invf_dict -f " + this.indexDirectory.toString()+File.separator+"index" + osextra);
    652712        p.waitFor();
     713        printProcessOutput(p);
    653714        if (p.exitValue() == 0) {
    654715          System.out.println("Inverted dictionary file successfully written");
     
    661722        }
    662723       
    663         p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f index -d " + this.indexDirectory.toString());
     724        p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s1 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
    664725        p.waitFor();
     726        printProcessOutput(p);
    665727        if (p.exitValue() == 0) {
    666728          System.out.println("Stemmed index 1 successfully written");
     
    673735        }
    674736
    675         p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f index -d " + this.indexDirectory.toString());
     737        p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s2 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
    676738        p.waitFor();
     739        printProcessOutput(p);
    677740        if (p.exitValue() == 0) {
    678741          System.out.println("Stemmed index 2 successfully written");
     
    684747        }
    685748
    686         p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f index -d " + this.indexDirectory.toString());
     749        p = Runtime.getRuntime().exec("mg_stem_idx -b 4096 -s3 -f " + this.indexDirectory.toString()+File.separator+"index"+osextra);
    687750        p.waitFor();
     751        printProcessOutput(p);
    688752        if (p.exitValue() == 0) {
    689753          System.out.println("Stemmed index 3 successfully written");
Note: See TracChangeset for help on using the changeset viewer.