Changeset 6376


Ignore:
Timestamp:
2004-01-09T12:51:47+13:00 (20 years ago)
Author:
cs025
Message:

Fixed some problems in MG indexing; also changed mgseqno so that it is
assigned on a per-index basis.

Location:
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/IndexerManager.java

    r6355 r6376  
    5353      if (document.isIndexed()) {
    5454        if (!this.indexers[i].indexDocument(document.getID(), document)) {
    55           System.out.println("Ending document");
     55          System.out.println("Ending document " + document.getID());
    5656        }
    5757       
    5858        // note any changes made to this document...
    59         if (document.isModified() || true) {
     59        if (document.isModified()) {
    6060          this.documents.modifiedDocument(document);
     61          //          System.out.println("Writing document "+document.getID());
    6162        }
     63        /**
     64        if (p == 0) {
     65          System.out.println("Writing document "+document.getID());
     66        }
     67        */
    6268      }
    6369    }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java

    r6349 r6376  
    3434  String       textStem;
    3535  List         indexes;
     36  String       indexName;
    3637  String       level;
    3738  String       field;
     39
     40  static final char END_OF_DOCUMENT = (char) 2;
     41  static final char END_OF_SECTION  = (char) 3;
     42  static final char END_OF_STREAM   = (char) 4;
    3843
    3944  class MGIndex
     
    134139
    135140  private Node recurseDOM(DocumentInterface metsDoc, Node node,
    136               AbstractStructure structure, StringBuffer buffer,
     141              AbstractStructure structure, StringBuffer textBuffer,
     142              StringBuffer extraBuffer, String indexName,
    137143              String namespace, String field)
    138144  {
    139145    // send out the ctrl-c...if this is
    140146    if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
    141       if (this.pass == 0) {
     147      if ((indexName != null) && indexName.startsWith("s")) {
    142148    METSDivision division = (METSDivision) structure;
    143149
     
    154160    }
    155161
    156     descriptive.addMetadata("gsdl3", "mgseqno", Integer.toString(this.sectionSeqNo));
    157       }
    158 
    159       buffer.append((char) 3);
    160       if (this.level != null &&
    161       this.level.equals(IndexerInterface.SECTION_LEVEL)) {
    162     buffer.append((char) 2);
     162    descriptive.addMetadata("gsdl3", "mgseqno", indexName + "." + Integer.toString(this.documentSeqNo));
     163    metsDoc.setModified(true);
     164    //  System.out.println("Assigning " + this.documentSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
     165      }
     166     
     167      // append an 'end of section' marker
     168      textBuffer.append(END_OF_SECTION);
     169
     170      // for document-level indexes, always append an 'end of document' tag at the
     171      // end of the document for each section.  Otherwise, each section is followed
     172      // by an end of document character.  This ensures that all indexes use the
     173      // same document numbering...
     174      if (this.level == null ||
     175      this.level.equals(IndexerInterface.DOCUMENT_LEVEL)) {
     176    // extraBuffer.append(END_OF_DOCUMENT);
     177      }
     178      else {
     179    textBuffer.append(END_OF_DOCUMENT);
     180    this.documentSeqNo ++;
    163181      }
    164182      this.sectionSeqNo ++;
    165     }
    166 
    167     // go through our children if required...
     183
     184      // produce the body here for metadata output of divisions - in the case of
     185      // text output, that will happen below...
     186      if (!this.field.equals("text"))
     187      { METSDescriptive descriptive;
     188   
     189    METSDivision division = (METSDivision) structure;
     190
     191    String metadataId = division.getDefaultMetadataReference();
     192       
     193    descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
     194    if (descriptive != null) {
     195      List values = descriptive.getMetadata(namespace, field);
     196     
     197      if (values != null) {
     198        Iterator valueIter = values.iterator();
     199        while (valueIter.hasNext()) {
     200          String value = valueIter.next().toString();
     201         
     202          textBuffer.append(value);
     203          if (valueIter.hasNext()) {
     204        textBuffer.append(END_OF_SECTION);
     205          }
     206        }
     207      }
     208    }
     209      }
     210    }
     211
     212    // go through our children as required...
    168213    Iterator children = structure.getChildIterator();
    169214    while (children.hasNext()) {
     
    174219      Node startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
    175220     
    176       // while this node isn't the child's start node, produce the node text
     221      // while this node isn't the child's start node, produce the HTML node text, if
     222      // in text field mode...
    177223      if (field.equals("text")) {
    178224    while (node != startNode) {
    179       XPointer.printNode(node, buffer, false);
     225      XPointer.printNode(node, textBuffer, false);
    180226
    181227      // print buffer to node
    182       node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null));
     228      node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
    183229    }
    184230      }
    185231     
    186232      // recurse to child
    187       this.recurseDOM(metsDoc, node, child, buffer, namespace, field);
     233      this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, indexName, namespace, field);
    188234    }
    189235
     
    193239      while (node != null) {
    194240    if (field.equals("text")) {
    195       XPointer.printNode(node, buffer, false);
    196     }
    197     else {
    198       METSDescriptive descriptive;
    199 
    200       if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
    201         METSDivision division = (METSDivision) structure;
    202 
    203         String metadataId = division.getDefaultMetadataReference();
    204        
    205         descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
    206         if (descriptive != null) {
    207           List values = descriptive.getMetadata(namespace, field);
    208          
    209           Iterator valueIter = values.iterator();
    210           while (valueIter.hasNext()) {
    211         String value = valueIter.next().toString();
    212        
    213         buffer.append(value);
    214         if (valueIter.hasNext()) {
    215           buffer.append((char) 3);
    216         }
    217           }
    218         }
    219       }
    220     }
    221     node = XPointer.getNextNode(node, (field.equals("text") ? buffer : null));
    222       }
    223       buffer.append((char) 3);
     241      XPointer.printNode(node, textBuffer, false);
     242    }
     243    node = XPointer.getNextNode(node, (field.equals("text") ? textBuffer : null));
     244      }
     245      /*
     246      textBuffer.append(END_OF_SECTION);
    224247      this.sectionSeqNo ++;
     248      */
    225249    }
    226250    return node;
    227251  }
    228252
    229   private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace, String field)
    230   { Node node = document.getDocumentElement();
     253  private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure,
     254                String indexName, String namespace, String field)
     255  { StringBuffer extraBuffer = new StringBuffer();
     256    Node node = document.getDocumentElement();
    231257    StringBuffer textBuffer = new StringBuffer();
    232    
    233     this.recurseDOM(metsDoc, node, structure, textBuffer, namespace, field);
     258
     259    this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, indexName, namespace, field);
     260    textBuffer.append(extraBuffer.toString());
    234261    return textBuffer.toString();
    235262  }
     
    255282    String docText = null;
    256283
     284    int startSeqNo = this.sectionSeqNo;
     285
    257286    Document domDocument = document.getDOMDocument();
    258     if (domDocument != null) {   
     287    if (domDocument != null) {
    259288      METSStructure sections = document.getDocumentStructure().getStructure("Section");
    260289      if (sections != null) {
    261     docText = this.prepareDOM(document, domDocument, sections, "gsdl3", this.field);
     290    docText = this.prepareDOM(document, domDocument, sections, this.indexName, "gsdl3", this.field);
    262291    //  System.out.println(docText);
    263292      }
    264293    }
    265294    if (docText == null) {
    266       docText = document.getDocumentText();
     295      if (this.field.equals("text")) {
     296    docText = Character.toString(END_OF_DOCUMENT) + Character.toString(END_OF_SECTION) +
     297      document.getDocumentText();
     298      }
     299      else {
     300    StringBuffer textBuffer = new StringBuffer();
     301    textBuffer.append(END_OF_DOCUMENT);
     302    textBuffer.append(END_OF_SECTION);
     303    List values = document.getDocumentMetadataItem("gsdl3", this.field);
     304    if (values != null) {
     305      Iterator valueIter = values.iterator();
     306      while (valueIter.hasNext()) {
     307        String value = valueIter.next().toString();
     308       
     309        textBuffer.append(value);
     310        if (valueIter.hasNext()) {
     311          textBuffer.append(END_OF_SECTION);
     312          sectionSeqNo ++;
     313        }
     314      }
     315    }
     316    else {
     317      textBuffer.append("No data");
     318    }
     319    docText = textBuffer.toString();
     320      }
     321      sectionSeqNo ++;
    267322    }
    268323
     
    284339    }
    285340    catch (IOException ex)
    286     {
     341        { System.out.println(ex);
    287342    }
    288343
     
    297352    }
    298353    catch (IOException ex)
    299     {
     354        { System.out.println(ex);
    300355    }
    301356      }
     
    311366    this.firstDocument = false;
    312367    if (this.pass == 0) {
    313       document.addDocumentMetadata("gsdl3", "mgseqno", Integer.toString(this.documentSeqNo));
     368      document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(this.documentSeqNo));
     369      //      System.out.println("Assigning " + startSeqNo + " to " + document.getID());
    314370    }
    315371    this.documentSeqNo += 1;
    316      
     372
    317373    try {
    318374      while (this.indexerErrors.available() > 0)
     
    343399
    344400      int indexNo = (this.pass - 2) / 2;
    345       if (indexNo >= 0) {
     401      if (this.pass >= 2) {
    346402    MGIndex index = (MGIndex) this.indexes.get(indexNo);
    347403     
     
    357413    }
    358414
    359     this.indexStem = this.outputDirectory + File.separatorChar +
    360       this.getIndexDirectory(index.getLevel(), index.getField()) +
    361       File.separatorChar + "index"; // TODO: modify for index
    362415    this.level = index.getLevel();
    363416    this.field = index.getField();
     417    this.indexName = this.getIndexDirectory(index.getLevel(), index.getField());
     418    this.indexStem = this.outputDirectory + File.separatorChar +
     419      this.indexName + File.separatorChar + "index"; // TODO: modify for index
     420    if (this.pass % 2 == 1) {
     421      this.indexName = null;
     422    }
    364423      }
    365424      else {
    366425    this.field = "text";
    367       }
     426    this.level = "document";
     427    this.indexName = null;
     428      }
     429      System.out.println("level is " + this.level);
     430      System.out.println("field is " + this.field);
     431      System.out.println("index name is " + this.indexName);
    368432     
    369433      // get the parameters for this execution of mg_passes
     
    422486       
    423487      try {
    424     this.indexerTextfeed.write((char) 2);
    425     this.indexerTextfeed.write(4);
     488    this.indexerTextfeed.write(END_OF_DOCUMENT);
     489    this.indexerTextfeed.write(END_OF_STREAM);
    426490    while (this.indexerErrors.available() > 0)
    427491    { char c = (char) this.indexerErrors.read();
     
    444508      { System.out.println(ex);
    445509      }
    446       System.out.println("Completed with " + this.mg_passes.exitValue());
     510      System.out.println("Pass " + this.pass + " completed with " + this.mg_passes.exitValue());
    447511
    448512      int mgPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
     
    458522          System.out.println("Error from mg_compression_dict: " + p.exitValue());
    459523        }
     524        else {
     525          System.out.println("Compressed dictionary successfully written");
     526        }
    460527      break;
    461528
     
    468535        }
    469536        else {
    470           System.out.println("Unable to create weights file");
     537          System.out.println("Unable to create weights file " + "mg_weights_build -f " + this.indexStem + " -t " + this.textStem + " -d /");
    471538        }
    472539
Note: See TracChangeset for help on using the changeset viewer.