Changeset 12191


Ignore:
Timestamp:
2006-07-13T10:29:55+12:00 (16 years ago)
Author:
kjdon
Message:

committed some changes that I had made ages ago. Not sure if it still compiles - I need to write an ant build file for this, and check compilation. Will do it once I need to - it's unclear whether anyone will ever use this again

Location:
trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build
Files:
1 added
7 edited

Legend:

Unmodified
Added
Removed
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/CollectionManager.java

    r12188 r12191  
    6262  CollectionMetadata metadata;       // collection-level metadata
    6363  GS3SQLConnection   database;       // the database to store everything in
    64   String             collectionHome;
    65   String             siteHome;
    66   String             collectionName;
    67   String             qualifiedCollectionName; // used as the database name
     64  public String             collectionHome;
     65  public String             siteHome;
     66  public String             collectionName;
     67  public String             qualifiedCollectionName; // used as the database name
    6868  String             notifyHost;
    6969
     
    341341        RecogniserInterface ri = this.buildManager.getRecogniserManager().addRecogniser(type);
    342342        if (ri != null) {
     343            ri.setCollectionManager(this);
    343344            ri.configure(doc_type);
    344345        }
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java

    r12188 r12191  
    419419  }
    420420 
     421    public int getNumSections() {
     422    return 3;
     423    /*  String query = "SELECT count(*) FROM divisions WHERE DocID="+this.id+" AND ParentType='Division'";
     424    try {
     425        Statement statement = connection.createStatement();
     426        ResultSet results = statement.executeQuery(query);
     427       
     428        if (results.first()) {
     429        int count = results.getInt(0);
     430        System.err.println("count = "+count);
     431        return count;
     432        }
     433    } catch (Exception e) {
     434        System.err.println("AbstractDocument.getNumSections(): "+e);
     435    }
     436    return -1;*/
     437    }
    421438  /**
    422439   *  @see DocumentInterface:isMETSCompatible
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/AbstractRecogniser.java

    r12188 r12191  
    55import org.w3c.dom.Element;
    66
     7import org.greenstone.gsdl3.gs3build.CollectionManager;
    78import org.greenstone.gsdl3.gs3build.metadata.*;
    89import org.greenstone.gsdl3.gs3build.util.HTTPTools;
     
    2223    ArrayList filename_extensions = null;
    2324    String document_type = "SET THIS IN THE CONCRETE CLASS";
    24    
     25    CollectionManager coll_manager = null;
     26
    2527    /** The constructor should set the variables
    2628     * preferredMimeType, filename_extensions and documentType
     
    3032    }
    3133   
     34    /** set the collection manager */
     35    public void setCollectionManager(CollectionManager coll_man) {
     36    this.coll_manager = coll_man;
     37    }
    3238    /** configure by default does nothing */
    3339    public boolean configure(Element config_elem){
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/METSDocument.java

    r12188 r12191  
    2626    public static final String METS_DOCUMENT_TYPE = "METS";
    2727    Document domDocument;
     28    String original_location;
     29   
    2830    public METSDocument(DocumentID id) {
    2931        super(id);
     
    5052        domDocument = builder.parse(file);
    5153
    52         int filePosition = file.getPath().indexOf("import/")+7;
    53         parseFilePath = file.getPath().substring(0, filePosition);
     54        //int filePosition = file.getPath().indexOf("import/")+7;
     55        //parseFilePath = file.getPath().substring(0, filePosition);
     56        parseFilePath = file.getParent(); // all refs should be relative to the current doc, or absolute
     57        parseFilePath += File.separator;
     58        this.original_location = parseFilePath;
    5459        // TODO: get all the types in the tree
    5560     
     
    139144    URL url = (URL) this.fileSet.getFile(0).getLocation();
    140145
    141     this.getSectionText("1");
     146    //this.getSectionText("1");
    142147   
    143148    if (url.getProtocol().equals("file")) {
    144         metsDoc = new HTMLDoc(url,url.getPath());
     149        String path = url.getPath();
     150        File f = new File(path);
     151        if (!f.isAbsolute() && this.original_location != null) {
     152        path = this.original_location + path;
     153        System.err.println("new path ="+path);
     154        }
     155        metsDoc = new HTMLDoc(url,path);
    145156    } else {
    146157        metsDoc = new HTMLDoc(url);
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/doctypes/RecogniserInterface.java

    r12188 r12191  
    44import org.w3c.dom.Element;
    55
     6import org.greenstone.gsdl3.gs3build.CollectionManager;
    67import org.greenstone.gsdl3.gs3build.metadata.*;
    78
     
    1314public interface RecogniserInterface
    1415{
     16    public void setCollectionManager(CollectionManager coll_man);
    1517    public boolean configure(Element config_elem);
    1618    public void setListRepository(DocumentList docList);
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/indexers/MGIndexer.java

    r12188 r12191  
    1818import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
    1919import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
     20import org.greenstone.gsdl3.gs3build.doctypes.AbstractDocument;
    2021import org.greenstone.gsdl3.gs3build.doctypes.HTMLDocument;
    2122import org.greenstone.gsdl3.gs3build.doctypes.METSDocument;
     
    2425import org.greenstone.gsdl3.gs3build.util.DOMUtils;
    2526import org.greenstone.gsdl3.util.GSXML;
     27import org.greenstone.gsdl3.util.GSFile;
    2628import org.greenstone.gsdl3.util.Misc;
    2729import org.greenstone.gsdl3.util.Processing;
     
    293295        //metsDoc.setModified(true);
    294296        //  System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
    295         } // section level
     297        } // first pass
    296298   
    297299        // append an 'end of section' marker
     
    408410    public boolean indexDocument(DocumentID docID, DocumentInterface document)
    409411    {
    410    
    411     if (!this.firstDocument) {
    412         this.indexBuffer.append(END_OF_DOCUMENT);
    413         mgPasses.processDocument(indexBuffer.toString());
    414         this.indexBuffer.delete(0, this.indexBuffer.length());
    415        
    416     }
    417 
     412    int count = ((AbstractDocument)document).getNumSections();
    418413    String docText = null;
    419414    // set the mgseqno if first pass
     
    426421   
    427422    //long start = System.currentTimeMillis();
    428     Document domDocument = document.getDOMDocument();
    429     if (domDocument != null) {
    430         System.err.println("dom doc is not null");
    431         METSStructure sections = document.getDocumentStructure().getStructure("Section");
    432         if (sections != null) {
    433         System.err.println("sections are not null");
    434         docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
    435         //  System.out.println(docText);
    436         }
    437     }
     423    if (this.current_index.getLevel().equals("section")) {
     424
     425        Document domDocument = document.getDOMDocument();
     426        if (domDocument != null) {
     427        System.err.println("dom doc is not null");
     428        METSStructure sections = document.getDocumentStructure().getStructure("Section");
     429        if (sections != null) {
     430            System.err.println("sections are not null");
     431            docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
     432            //  System.out.println(docText);
     433        }
     434        }
     435    }
     436       
     437   
    438438    //long finish = System.currentTimeMillis();
    439439    //System.err.println("dom doc = "+ Long.toString(finish-start));
     
    447447        if (field.equals("text")) {
    448448            doc_text_buffer.append(document.getDocumentText());
     449            doc_text_buffer.append(" ");
    449450        }  else {
    450451            // its a metadata - do namespace properly!!
     
    455456                String value = valueIter.next().toString();
    456457                doc_text_buffer.append(value);
     458                doc_text_buffer.append(" ");
    457459            }
    458460            }
     
    461463        docText = doc_text_buffer.toString();
    462464        sectionSeqNo ++;
     465        int num_secs = 0;
    463466    }
    464467    //finish = System.currentTimeMillis();
     
    467470    this.indexBuffer.append(docText);
    468471    // remember that we're not on the first document,
    469     this.firstDocument = false;
     472    //this.firstDocument = false;
    470473    this.documentSeqNo ++;
     474    //if (!this.firstDocument) {
     475    this.indexBuffer.append(END_OF_DOCUMENT);
     476    mgPasses.processDocument(indexBuffer.toString());
     477    String filename="";
     478    try {
     479        filename = "pass"+this.pass+"doc"+this.documentSeqNo+".txt";
     480        System.err.println("trying to write to "+filename);
     481        GSFile.writeFile(indexBuffer.toString().getBytes(), filename);
     482    } catch (Exception e) {
     483        System.err.println("COUldn't write to file, "+filename);
     484    }
     485    this.indexBuffer.delete(0, this.indexBuffer.length());
     486       
     487   
    471488
    472489    return true;
     
    487504    this.indexBuffer = new StringBuffer();
    488505    int indexNo = this.pass/2;
    489     this.current_index = null;
    490506   
    491507    this.current_index = (MGIndex) this.indexes.get(indexNo);
     
    512528        this.textStem = this.indexStem;
    513529    }
     530   
    514531    mgPasses.setFileName(this.indexStem);
    515532    if (!Misc.isWindows()) {
     
    573590    } catch (Exception e) {}
    574591   
    575     int exit_value = 0;
     592    int exit_value = mgPasses.exitValue();
    576593    System.out.println("Pass " + this.pass + " completed with " + exit_value);
    577594    if (exit_value !=0) {
  • trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/indexers/MGPPIndexer.java

    r12188 r12191  
    88import org.greenstone.gsdl3.gs3build.doctypes.METSDocument;
    99import org.greenstone.gsdl3.util.Misc;
     10import org.greenstone.gsdl3.util.GSFile;
    1011import org.greenstone.gsdl3.util.GSXML;
    1112import org.greenstone.gsdl3.util.Processing;
    1213import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
    1314import org.greenstone.gsdl3.gs3build.metadata.*;
     15import org.greenstone.gsdl3.gs3build.util.DOMUtils;
     16
    1417import java.io.InputStream;
    1518import java.io.OutputStream;
     
    2225import org.w3c.dom.Element;
    2326import org.w3c.dom.Node;
     27import org.w3c.dom.NodeList;
    2428import org.w3c.dom.Document;
    25 
    26 public class MGPPIndexer extends AbstractIndexer
     29import org.w3c.dom.NamedNodeMap;
     30
     31import org.greenstone.gsdl3.util.XMLConverter;
     32
     33public class MGPPIndexer //extends AbstractIndexer
     34    implements IndexerInterface
    2735{
    2836    int          pass;
    2937    int          documentSeqNo;
    3038    int          sectionSeqNo;
    31     String       name;
    3239    boolean      firstDocument;
    3340    File indexDirectory;
    34     File textDirectory;
     41    //    File textDirectory;
    3542    String       indexStem;
    36     String       textStem;
     43    //    String       textStem;
    3744    StringBuffer indexBuffer;
    3845    String       outputDirectory;
    39     //String       outputStem;
    40 //     String       passExtra;
    41 //     InputStream   indexerFeedback;
    42 //     InputStream  indexerErrors;
    43 //     OutputStream indexerTextfeed;
    44 //     Process      mgpp_passes;
    45     //String       overallName;
    46     String       currentIndexName;
    47     String       currentIndexLevel;
    48        String       currentIndexField;
     46    String       overallName;
     47   
     48    List indexes;
     49    MGPPIndex current_index = null;
    4950    MGPPPassesWrapper mgppPasses;
    5051   
    5152
    52     static final String documentSeparator = "<Document>";
    53     static final String sectionSeparator = "<Section>";
    54 
    55     static final String START_OF_DOCUMENT = "<Document>";
    56     static final String END_OF_DOCUMENT = "</Document>";
    57     static final String START_OF_SECTION = "<Section>";
    58     static final String END_OF_SECTION = "</Section>";
     53    static final String DOCUMENT = "Doc";
     54    static final String SECTION = "Sec";
     55    static final String START_OF_DOCUMENT = "<"+DOCUMENT+">";
     56    static final String END_OF_DOCUMENT = "</"+DOCUMENT+">";
     57    static final String START_OF_SECTION = "<"+SECTION+">";
     58    static final String END_OF_SECTION = "</"+SECTION+">";
    5959   
    6060   
     
    6767    public String name = null;
    6868    public String doc_level = null;
    69     public ArrayList levels = null;
    70     public ArrayList fields = null;
     69    public List levels = null;
     70    public List fields = null;
    7171    boolean error = false;// assume built until we get an error
     72   
     73    public MGPPIndex(Element index_element) {
     74       
     75        this.fields = new ArrayList();
     76        this.levels = new ArrayList();
     77        this.name = index_element.getAttribute(GSXML.NAME_ATT);
     78        if (this.name.equals("")) {
     79        // TODO make this dynamic
     80        this.name = "xx";
     81        }
     82        NodeList children = index_element.getChildNodes();
     83        for (int c = 0; c < children.getLength(); c ++) {
     84        Node child = children.item(c);
     85       
     86        if (child.getNodeType() == Node.ELEMENT_NODE) {
     87            String name = child.getNodeName();
     88           
     89            if (name.equals(GSXML.LEVEL_ELEM)) {
     90            String level = DOMUtils.getNodeChildText(children.item(c));
     91            this.levels.add(level);
     92            }
     93            else if (name.equals(GSXML.FIELD_ELEM)) {
     94            String fieldName = DOMUtils.getNodeChildText(children.item(c));
     95            this.fields.add(fieldName);
     96            }
     97        }
     98        }
     99    }
    72100   
    73101    public MGPPIndex(String name) {
    74102        this.name = name;
    75         doc_level = "Document";
    76     }
    77 
     103        this.doc_level = DOCUMENT;
     104        this.fields = new ArrayList();
     105        this.levels = new ArrayList();
     106    }
     107   
    78108    public void setDocLevel(String doc_level) {
    79109        this.doc_level = doc_level;
     
    88118        this.fields.add(field);
    89119    }
    90 
     120    public List getLevels() {
     121        return this.levels;
     122    }
     123    public List getFields() {
     124        return this.fields;
     125    }
     126    public String getName() {
     127        return this.name;
     128    }
    91129    public boolean hasError() {
    92130        return this.error;
     
    101139    public MGPPIndexer(String name)
    102140    {
    103     this.name = name;
     141    this.overallName = name;
     142    this.indexes = new ArrayList();
    104143    //this.passExtra = "";
    105144    }
     
    107146    public String getName()
    108147    {
    109     return this.name;
     148    return this.overallName;
     149    }
     150   
     151    public String getIndexType()
     152    {
     153    return MGPP_INDEX_TYPE;
     154    }
     155
     156    // for now make all indexes use document and section levels.
     157    // then when writing the buildconfig, only display the levels
     158    // that the user has specified (likely to be both doc and sec).
     159    public boolean configure(Node search_node)
     160    {
     161    NodeList index_children = GSXML.getChildrenByTagName(search_node, GSXML.INDEX_ELEM);
     162   
     163    // add a text 'index' - we should be able to turn this off in the config file?
     164    MGPPIndex text_index = new MGPPIndex("text");
     165    text_index.addField("text");
     166    // always do eveything at doc and sec level at the moment
     167    text_index.addLevel(DOCUMENT);
     168    text_index.addLevel(SECTION);
     169    indexes.add(text_index);
     170   
     171    for (int i = 0; i < index_children.getLength(); i ++) {
     172        Element index_elem = (Element)index_children.item(i);
     173        MGPPIndex index = new MGPPIndex(index_elem);
     174        if (index.getName() != null && index.getLevels() != null && index.getFields()!= null) {
     175        indexes.add(index);
     176        } else {
     177        System.err.println("invalid index spec, not including"+new XMLConverter().getPrettyString(index_elem));
     178        }
     179    }
     180    // TODO make sure all index names are unique
     181    return true;
    110182    }
    111183
     
    120192       
    121193        // attempt to ensure that the text subdirectory exists
    122         this.textDirectory = new File(outputDirectory, "text");
    123         if (!textDirectory.exists()) {
    124         if (!textDirectory.mkdir()) {
    125             return false;
    126         }
    127         }
    128         else if (!textDirectory.isDirectory()) {
    129         return false;
    130         }
    131         this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM;
    132        
    133         // attempt to ensure that the index subdir exists
    134         this.indexDirectory = new File(outputDirectory, "idx");
    135         if (!indexDirectory.exists()) {
    136         if (!indexDirectory.mkdir()) {
    137             return false;
    138         }
    139         }
    140         else if (!indexDirectory.isDirectory()) {
    141         return false;
    142         }
    143         this.indexStem = this.indexDirectory.getPath() + File.separator + INDEX_FILE_STEM;
     194//      this.textDirectory = new File(outputDirectory, "text");
     195//      if (!textDirectory.exists()) {
     196//      if (!textDirectory.mkdir()) {
     197//          return false;
     198//      }
     199//      }
     200//      else if (!textDirectory.isDirectory()) {
     201//      return false;
     202//      }
     203//      this.textStem = this.textDirectory.getPath() + File.separator + INDEX_FILE_STEM;
     204       
     205//      // attempt to ensure that the index subdir exists
     206//      this.indexDirectory = new File(outputDirectory, "idx");
     207//      if (!indexDirectory.exists()) {
     208//      if (!indexDirectory.mkdir()) {
     209//          return false;
     210//      }
     211//      }
     212//      else if (!indexDirectory.isDirectory()) {
     213//      return false;
     214//      }
     215//      this.indexStem = this.indexDirectory.getPath() + File.separator + INDEX_FILE_STEM;
    144216       
    145217        // Sign to the user which mg directory is being used...
    146         System.out.println("Output MGPP text directory is " + this.textStem);
    147         System.out.println("Output MGPP index directory is " + this.indexStem);
     218//      System.out.println("Output MGPP text directory is " + this.textStem);
     219//      System.out.println("Output MGPP index directory is " + this.indexStem);
    148220    }
    149221    this.pass = 0;
     
    151223    }
    152224
    153     public String getIndexType()
    154     {
    155     return MGPP_INDEX_TYPE;
    156     }
    157 
    158     public boolean addIndex(String name, String level, String field)
    159     {
    160 //  if (level == "doc_level") {
    161 //      passExtra = " -J " + level;
    162 //  }
    163 //  else {
    164 //      passExtra = " -K " + level;
    165 //  }
    166     return true;
    167     }
     225
     226//     public boolean addIndex(String name, String level, String field)
     227//     {
     228// //   if (level == "doc_level") {
     229// //       passExtra = " -J " + level;
     230// //   }
     231// //   else {
     232// //       passExtra = " -K " + level;
     233// //   }
     234//  return true;
     235//     }
    168236
    169237    /**
     
    176244    if (this.pass == 0) {
    177245        document.removeAllMetadata("gsdl3", "mgseqno");
    178     }
    179 
    180     // why do this at the start and not at the end???
    181     if (!this.firstDocument) {
    182         // Send a '</Document>' at the end of the doc
    183         this.indexBuffer.append(END_OF_DOCUMENT);
    184         mgppPasses.processDocument(indexBuffer.toString());
    185         this.indexBuffer.delete(0, this.indexBuffer.length());
    186     }
    187 
     246        document.addDocumentMetadata("gsdl3", "mgseqno", this.overallName+"."+Integer.toString(this.sectionSeqNo));
     247    }
     248   
     249   
    188250    String docText = null;
    189251   
    190     //int startSeqNo = this.sectionSeqNo;
    191     //this.sectionSeqNo ++;
    192     int startSeqNo = this.documentSeqNo;
    193 
     252    this.sectionSeqNo ++;
     253   
    194254    Document domDocument = document.getDOMDocument();
    195255    if (domDocument != null) {
     
    198258        if (sections != null) {
    199259        System.err.println("sections are not null");
    200         docText = this.prepareDOM(document, domDocument, sections, "gsdl3"); //this.name, "gsdl3", this.field);
    201         //  System.out.println(docText);
     260        docText = this.prepareDOM(document, domDocument, sections, "gsdl3");
    202261        }
    203262    }
    204263    if (docText == null) {
    205264        System.err.println("dom doc or sections was null - asking for doc text");
    206         //if (this.currentIndexField.equals("text")) {
    207         //docText = Character.toString(END_OF_DOCUMENT) + document.getDocumentText();
    208         docText = document.getDocumentText();
    209         //}
    210 //      else {
    211 //      StringBuffer textBuffer = new StringBuffer();
    212 //      //textBuffer.append(END_OF_DOCUMENT);
    213 //      List values = document.getDocumentMetadataItem("gsdl3", this.currentIndexField);
    214 //      if (values != null) {
    215 //          Iterator valueIter = values.iterator();
    216 //          while (valueIter.hasNext()) {
    217 //          String value = valueIter.next().toString();
    218            
    219 //          textBuffer.append(value);
    220 //          if (valueIter.hasNext()) {
    221 //              //textBuffer.append(END_OF_SECTION);
    222 //              //        sectionSeqNo ++;
    223 //          }
    224 //          }
    225 //      }
    226 //      else {
    227 //          textBuffer.append("No data");
    228 //      }
    229 //      docText = textBuffer.toString();
    230 //      }
     265        StringBuffer doc_text_buffer = new StringBuffer();
     266        List fields = this.current_index.getFields();
     267        for (int i=0; i<fields.size(); i++) {
     268        String field = (String)fields.get(i);
     269        if (field.equals("text")) {
     270            doc_text_buffer.append(document.getDocumentText());
     271            doc_text_buffer.append(" ");
     272        }  else {
     273            // its a metadata - do namespace properly!!
     274            List values = document.getDocumentMetadataItem("gsdl3", field);
     275            if (values != null) {
     276            Iterator valueIter = values.iterator();
     277            while (valueIter.hasNext()) {
     278                String value = valueIter.next().toString();
     279                doc_text_buffer.append(value);
     280                doc_text_buffer.append(" ");
     281            }
     282            }
     283        }
     284        } // for each field
     285        docText = doc_text_buffer.toString();
    231286        sectionSeqNo ++;
    232287    }
    233 
    234         //try {                 
    235         //  this.indexerTextfeed.write(documentSeparator.getBytes(), 0, documentSeparator.getBytes().length);
    236         // }
    237 //      catch (IOException ex) {
    238 //      System.out.println("Bad output on end of document" + ex);
    239 //      ex.printStackTrace();
    240 //      return false;
    241 //      }
    242 //  }
    243 
     288   
    244289    this.indexBuffer.append(START_OF_DOCUMENT);
    245     //String docText = document.getDocumentText();
     290    this.indexBuffer.append(START_OF_SECTION);
    246291    this.indexBuffer.append(docText);
    247     //int startSeqNo = this.documentSeqNo;
    248    
    249 //  byte [] bytes = docText.getBytes();
    250 //  int pos = 0, end = bytes.length;
    251    
    252 //  try {
    253 //      while (pos < end) {
    254 //      this.indexerTextfeed.write(bytes, pos, (end - pos > 512 ? 512 : end - pos));
    255 //      pos = pos + 512;
    256    
    257 //      try {
    258 //          while (this.indexerFeedback.available() > 0) {
    259 //          byte b[] = new byte[this.indexerFeedback.available()];
    260 //          System.out.println("Feedback of " + this.indexerFeedback.available());
    261 //          this.indexerFeedback.read(b);
    262 //          System.out.println(b);
    263 //          }
    264 //      }
    265 //      catch (IOException ex) {
    266            
    267 //      }
    268 
    269 
    270 //      try {
    271 //          while (this.indexerErrors.available() > 0) {
    272 //          byte b[] = new byte[this.indexerErrors.available()];
    273 //          System.out.println("Feedback of " + this.indexerErrors.available());
    274 //          this.indexerErrors.read(b);
    275 //          System.out.println(new String(b));
    276 //          }
    277 //      }
    278 //      catch (IOException ex){
    279            
    280 //      }
    281 //      }
    282 //  }
    283 //  catch (IOException ex) {
    284 //      System.out.println("Bad output during document write " + ex + " " + pos + " " + end);
    285 //      ex.printStackTrace();
    286 //      return false;
    287 //  }
     292    this.indexBuffer.append(END_OF_SECTION);
     293    this.indexBuffer.append(END_OF_DOCUMENT);
     294    this.mgppPasses.processDocument(indexBuffer.toString());
     295    this.indexBuffer.delete(0, this.indexBuffer.length());
     296
    288297    this.firstDocument = false;
    289298
    290     if (this.pass == 0) {   
    291         document.addDocumentMetadata("gsdl3", "mgseqno", "dtx."+Integer.toString(startSeqNo));
    292     }
    293299    this.documentSeqNo++;
    294300   
    295 //  try {
    296 //      while (this.indexerErrors.available() > 0) {
    297 //      char c = (char) this.indexerErrors.read();
    298 //      System.out.println(c);
    299 //      }
    300 //      while (this.indexerFeedback.available() > 0) {
    301 //      byte b[] = new byte[this.indexerFeedback.available()];
    302 //      System.out.println("Feedback of " + this.indexerFeedback.available());
    303 //      this.indexerFeedback.read(b);
    304 //      }
    305 //  }
    306 //  catch (IOException ex) {
    307        
    308 //  }
    309301    return true;
    310302    }
     
    323315    this.indexBuffer = new StringBuffer();
    324316   
    325     MGPPIndex index = null; // do something with this!!
    326    
     317    int indexNo = this.pass/2;
     318    this.current_index = (MGPPIndex) this.indexes.get(indexNo);
     319
     320    if (this.current_index.hasError()) {
     321        // an error has already occurred for this index, don't continue
     322        System.out.println("pass "+this.pass+": aborted due to errors in the previous pass");
     323        return false;
     324    }
     325   
     326    // attempt to ensure that the text/index subdirectory exists
     327    this.indexDirectory = new File(outputDirectory, current_index.getName());
     328    if (!indexDirectory.exists()) {
     329        if (!indexDirectory.mkdir()) {
     330        return false;
     331        }
     332    }
     333    else if (!indexDirectory.isDirectory()) {
     334        return false;
     335    }
     336
     337    this.indexStem = this.indexDirectory.getPath() + File.separatorChar + INDEX_FILE_STEM;  // TODO: modify for index
     338//  if (this.pass == 0) {
     339//      // first pass, also set up the textStem
     340//      this.textDirectory = this.indexDirectory;
     341//      this.textStem = this.indexStem;
     342//  }
     343
    327344    // get the parameters for this execution of mg_passes
    328     mgppPasses.setFileName((this.pass < 2 ? this.textStem : this.indexStem ));
     345    mgppPasses.setFileName(this.indexStem);
    329346    if (!Misc.isWindows()) {
    330347        mgppPasses.setBasePath("/");
    331348    }
    332349   
    333     mgppPasses.setDocumentTag("Document");
    334     //mgppPasses.addLevelTag("Section");
    335 
    336     this.currentIndexLevel = "Document";// index.getLevel();
    337     this.currentIndexField = "text";//index.getField();
    338     this.currentIndexName = "idx"; //index.getName();
    339        
    340 
    341     switch (this.pass) {
     350    // always use Doc and Sec for now
     351    mgppPasses.setDocumentTag(DOCUMENT);
     352    mgppPasses.addLevelTag(SECTION);
     353   
     354    //this.currentIndexLevel = "Document";// index.getLevel();
     355    //this.currentIndexField = "text";//index.getField();
     356    //this.currentIndexName = "idx"; //index.getName();
     357       
     358    int mgppPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
     359    switch (mgppPass) {
    342360    case 0:
    343361        // -T1
    344362        mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_1);
    345         //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem + " -T1");
    346363        break;
    347364       
     
    349366        // -T2
    350367        mgppPasses.addPass(MGPPPassesWrapper.TEXT_PASS_2);
    351         //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -T2");
    352368        break;
    353369       
     
    355371        // -I1
    356372        mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_1);
    357         //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I1");
    358373        break;
    359374       
    360375    case 3:
    361         //Process p = Runtime.getRuntime().exec("mgpp_perf_hash_build -f " + this.outputStem);
    362         //p.waitFor();
    363376        // -I2
    364377        mgppPasses.addPass(MGPPPassesWrapper.INDEX_PASS_2);
    365         //mgpp_passes = Runtime.getRuntime().exec("mgpp_passes " + passExtra + " -f " + this.outputStem +" -I2");
    366378        break;
    367379    }
    368        
    369     //this.indexerFeedback = mgpp_passes.getInputStream();
    370     //   this.indexerErrors   = mgpp_passes.getErrorStream();
    371     //   this.indexerTextfeed = mgpp_passes.getOutputStream();
    372     //  }
    373     //catch (IOException ex) {
    374     //   System.out.println(ex);
    375     //   ex.printStackTrace();
    376     //   return false;
    377     //}/   
    378     //catch (InterruptedException ex) {
    379     //   System.out.println(ex);
    380     //   ex.printStackTrace();
    381     //   return false;
    382     //}
     380   
    383381    mgppPasses.init();
    384382    System.out.println("Pass " + this.pass);
     
    391389    public boolean endPass(int passNumber)
    392390    {
    393     // TODO: end pass
    394     Process p;
    395     MGPPIndex index = null; // do something with this!!
    396391    try {
    397392        this.indexBuffer.append(END_OF_DOCUMENT);
     
    403398        System.out.println(ex);
    404399    }
     400   
    405401    mgppPasses.finish();
     402   
    406403    try {
    407404        Thread.sleep(1000);
     
    411408    System.out.println("Pass " + this.pass + " completed with " + exit_value);
    412409    if (exit_value !=0) {
    413         //assume something has gone wrong, don't continue
    414 //      if (index != null) {
    415 //      index.setError(true);
    416 //      return false;
    417 //      }
    418     }
     410        this.current_index.setError(true);
     411        return false;
     412    }
     413   
     414    int mgppPass = this.pass < 2 ? this.pass : ((this.pass % 2) + 2);
    419415   
    420416    String osextra = "";
     
    423419    }
    424420
    425     switch (this.pass) {
     421    switch (mgppPass) {
    426422    case 0:
    427         //System.exit(1);
    428423        System.out.println("Compressing dictionary");
    429         exit_value = Processing.runProcess("mgpp_compression_dict -f " + this.textStem + " -S -H -2 -k 5120"+ osextra);
     424        exit_value = Processing.runProcess("mgpp_compression_dict -f " + this.indexStem + " -S -H -2 -k 5120"+ osextra);
    430425       
    431426        if (exit_value == 0) {
     
    433428        } else {
    434429        System.err.println("Error from mgpp_compression_dict: " + exit_value);
    435         //index.setError(true);
     430        this.current_index.setError(true);
    436431        return false;
    437432        }
     
    445440        } else {
    446441        System.err.println("Unable to build the perfect hash");
    447         //index.setError(true);
     442        this.current_index.setError(true);
    448443        return false;
    449444        }
     
    457452        } else {
    458453        System.err.println("Unable to create weights file");
    459         //index.setError(true);
     454        this.current_index.setError(true);
    460455        return false;
    461456        }
     
    467462        } else {
    468463        System.out.println("Unable to create inverted dictionary file");
    469         //index.setError(true);
     464        this.current_index.setError(true);
    470465        return false;
    471466        }
     
    477472        } else {
    478473        System.out.println("Unable to create stemmed index 1");
    479         //index.setError(true);
     474        this.current_index.setError(true);
    480475        return false;
    481476        }
     
    486481        } else {
    487482        System.out.println("Unable to create stemmed index 2");
    488         //index.setError(true);
     483        this.current_index.setError(true);
    489484        return false;
    490485        }
     
    494489        } else {
    495490        System.out.println("Unable to create stemmed index 3");
    496         //index.setError(true);
     491        this.current_index.setError(true);
    497492        return false;
    498493        }
     
    516511    public int getNumberOfPasses()
    517512    {
     513    //return this.indexes.size()*2;
    518514    return 4;
    519515    }
    520516
    521517    public boolean addServiceDescriptions(Element service_rack_list) {
     518   
     519    // we only have one real index at the moment, - the first index in the list will be the text one.
     520    MGPPIndex index = (MGPPIndex)this.indexes.get(1);
     521    if (index.hasError()) {
     522        // we weren't able to create any indexes - don't add a search service
     523        return false;
     524    }
     525   
    522526    Document doc = service_rack_list.getOwnerDocument();
    523 
    524     // generate the list of indexes
     527    Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
     528    service_rack_list.appendChild(search_service_elem);
     529    Element retrieve_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);       
     530    service_rack_list.appendChild(retrieve_service_elem);
     531   
     532    // generate the list of indexes - with only one index in it
    525533    Element index_list = doc.createElement(GSXML.INDEX_ELEM+GSXML.LIST_MODIFIER);
    526     Element e = doc.createElement(GSXML.INDEX_ELEM);
    527     e.setAttribute(GSXML.NAME_ATT, "idx");
    528     index_list.appendChild(e);
    529     String def_index = "idx";
    530    
    531 //  boolean found_index = false;
    532 //  String def_index = ""; // the default index will just be the first one created for now.
    533 //  for (int i=0; i<this.indexes.size(); i++) {
    534 //      MGIndex index = (MGIndex)this.indexes.get(i);
    535 //      if (!index.hasError()) {
    536 //      Element e = doc.createElement(GSXML.INDEX_ELEM);
    537 //      e.setAttribute(GSXML.NAME_ATT, index.getName());
    538 //      index_list.appendChild(e);
    539 //      if (found_index == false) {
    540 //          // this is the first index
    541 //          found_index = true;
    542 //          def_index = index.getName();
    543 //      }
    544 //      }
    545 //  }
    546    
    547 //  if (!found_index) {
    548 //      // no indexes were able to be created, so we can't use them or the text
    549 //      return false;
    550 //  }
    551 
    552     Element f = doc.createElement(GSXML.FIELD_ELEM+GSXML.LIST_MODIFIER);
     534    Element index_elem = doc.createElement(GSXML.INDEX_ELEM);
     535    index_elem.setAttribute(GSXML.NAME_ATT, index.getName());
     536    index_list.appendChild(index_elem);
    553537   
    554538    Element default_index = doc.createElement("defaultIndex");
    555     default_index.setAttribute(GSXML.NAME_ATT, def_index);
     539    default_index.setAttribute(GSXML.NAME_ATT, index.getName());
    556540
    557541    Element base_index_name = doc.createElement("baseIndexPrefix");
    558     base_index_name.setAttribute(GSXML.NAME_ATT, "dtx");  //overallName);
     542    base_index_name.setAttribute(GSXML.NAME_ATT, overallName);
    559543
    560544    Element index_stem = doc.createElement("indexStem");
    561     index_stem.setAttribute(GSXML.NAME_ATT, "index");
    562 
    563     Element search_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
    564     Element retrieve_service_elem = doc.createElement(GSXML.SERVICE_CLASS_ELEM);
     545    index_stem.setAttribute(GSXML.NAME_ATT, INDEX_FILE_STEM);
     546   
    565547    Element default_level = doc.createElement("defaultLevel");
    566     default_level.setAttribute(GSXML.NAME_ATT, "Document");
    567 
    568     Element level_list = doc.createElement("levelList");
    569     Element level = doc.createElement("level");
    570     level.setAttribute(GSXML.NAME_ATT, "Document");
     548    default_level.setAttribute(GSXML.NAME_ATT, SECTION);
     549
     550    // always have doc and sec at the moment
     551    Element level_list = doc.createElement(GSXML.LEVEL_ELEM+GSXML.LIST_MODIFIER);
     552    Element level = doc.createElement(GSXML.LEVEL_ELEM);
     553    level.setAttribute(GSXML.NAME_ATT, DOCUMENT);
    571554    level_list.appendChild(level);
    572555
    573     Element field_list = doc.createElement("fieldList");
    574     Element field = doc.createElement("field");
    575     field.setAttribute(GSXML.NAME_ATT, "ZZ");
    576     field_list.appendChild(field);
    577 
    578     service_rack_list.appendChild(search_service_elem);
    579     service_rack_list.appendChild(retrieve_service_elem);
    580 
     556    level = doc.createElement(GSXML.LEVEL_ELEM);
     557    level.setAttribute(GSXML.NAME_ATT, SECTION);
     558    level_list.appendChild(level);
     559   
     560    Element field_list = doc.createElement(GSXML.FIELD_ELEM+GSXML.LIST_MODIFIER);
     561    Element field;
     562    List fields = index.getFields();
     563    for (int i=0; i<fields.size(); i++) {
     564        String f = (String) fields.get(i);
     565        field = doc.createElement(GSXML.FIELD_ELEM);
     566        field.setAttribute(GSXML.NAME_ATT, f);
     567        field_list.appendChild(field);
     568    }
     569   
    581570    search_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGPPSearch"); 
    582571    search_service_elem.appendChild(index_list);
     
    584573    search_service_elem.appendChild(level_list);
    585574    search_service_elem.appendChild(default_level);
    586 search_service_elem.appendChild(field_list); // do we need this??
     575    search_service_elem.appendChild(field_list);
    587576    search_service_elem.appendChild(base_index_name);
    588577    search_service_elem.appendChild(index_stem);
    589 
     578   
    590579    retrieve_service_elem.setAttribute(GSXML.NAME_ATT, "GS3MGPPRetrieve");
    591580    retrieve_service_elem.appendChild(default_level.cloneNode(true));
     
    597586 
    598587 
    599   private Node recurseDOM(DocumentInterface metsDoc, Node node,
    600               AbstractStructure structure, StringBuffer textBuffer,
    601               StringBuffer extraBuffer, String namespace)
    602               //String name, String namespace, String field)
    603   {
    604     // send out the ctrl-c...if this is
    605     if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
     588    private Node recurseDOM(DocumentInterface metsDoc, Node node,
     589                AbstractStructure structure, StringBuffer textBuffer,
     590                StringBuffer extraBuffer, String namespace)
     591    //String name, String namespace, String field)
     592    {
     593    List fields = current_index.getFields();
     594    // send out the ctrl-c...if this is
     595    if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
     596        // try doing this for all index types
     597        // actually we should only need to do this once ????
     598        if (this.pass == 0) {
     599        System.err.println("division structure");
     600        //if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {
     601        METSDivision division = (METSDivision) structure;
     602
     603        // get the division metadata block
     604        METSDescriptive descriptive;
     605        String metadataId = division.getDefaultMetadataReference();
     606        if (metadataId == null) {
     607            descriptive = metsDoc.getDocumentMetadata().createDescriptive(division.getLabel());
     608            division.addMetadataReference(descriptive.getID());
     609        }
     610        else {
     611            // Get the descriptive item...
     612            descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
     613        }
     614       
     615        descriptive.addMetadata("gsdl3", "mgseqno", this.overallName + "." + Integer.toString(this.sectionSeqNo));
     616       
     617        metsDoc.setChanged(true);
     618        //metsDoc.setModified(true);
     619        //  System.out.println("Assigning " + this.sectionSeqNo + " to " + metsDoc.getID() + " " + division.getLabel());
     620        } // first pass
     621   
     622        // append an 'end of section' marker
     623        //textBuffer.append(END_OF_SECTION);
     624        this.sectionSeqNo ++;
     625        textBuffer.append(END_OF_SECTION);
     626        textBuffer.append(START_OF_SECTION);
     627        // for document-level indexes, always append an 'end of document' tag at the
     628        // end of the document for each section.  Otherwise, each section is followed
     629        // by an end of document character.  This ensures that all indexes use the
     630        // same document numbering...
     631       
     632//      if (this.current_index.getLevel().equals(IndexerInterface.DOCUMENT_LEVEL)) {
     633//      extraBuffer.append(END_OF_DOCUMENT);
     634//      }
     635//      else {
     636//      textBuffer.append(END_OF_DOCUMENT);
     637//      this.documentSeqNo ++;
     638//      }
     639       
     640        // produce the body here for metadata output of divisions - in the case of
     641        // text output, that will happen below...
     642       
     643        if (fields.size()>1 || !((String)fields.get(0)).equals("text")) {
     644        // if there is only text, don't do this
     645        METSDescriptive descriptive;
     646   
     647        METSDivision division = (METSDivision) structure;
     648
     649        String metadataId = division.getDefaultMetadataReference();
     650        // are there other metadata refs to get??
     651        descriptive = metsDoc.getDocumentMetadata().getDescriptiveById(metadataId);
     652        if (descriptive != null) {
     653            for (int i=0; i<fields.size(); i++) {
     654            String field = (String)fields.get(i);
     655            if (field.equals("text")) {
     656                continue;
     657            }
     658            List values = descriptive.getMetadata(namespace, field);
     659            if (values != null) {   
     660                Iterator valueIter = values.iterator();
     661                while (valueIter.hasNext()) {
     662                String value = valueIter.next().toString();
     663                textBuffer.append("<"+field+">");
     664                textBuffer.append(value);
     665                textBuffer.append("</"+field+">");
     666                }
     667            }
     668            }
     669        }
     670        }
     671    }
     672   
     673    // go through our children as required...
     674    Iterator children = structure.getChildIterator();
     675    Node startNode;
     676    boolean index_text = fields.contains("text");
     677    while (children.hasNext()) {
     678        AbstractStructure child = (AbstractStructure) children.next();
     679     
     680        // get xpointer for child
     681        // get start position node
     682        if (metsDoc.getDocumentType() == "METS"){
     683        startNode = ((METSDocument) metsDoc).getSectionStartNode((METSDivision) child);   
     684        } else {
     685        startNode = ((HTMLDocument) metsDoc).getSectionStartNode((METSDivision) child);
     686        }
     687       
     688        // while this node isn't the child's start node, produce the
     689        // HTML node text, if in text field mode...
     690        if (index_text) {
     691        while (node != startNode) {
     692            XPointer.printNode(node, textBuffer, false);
     693            node = XPointer.getNextNode(node);
     694        }
     695        }
     696       
     697        // recurse to child
     698        node = this.recurseDOM(metsDoc, node, child, textBuffer, extraBuffer, namespace); // name, namespace, field);
     699    } // while next child
     700
     701    // close a document - the actual closing \B will be done by the main
     702    // loop, so only a required \C is printed here...
     703    // why have we got STRUCTURE_TYPE here and DIVISION_TYPE above????
     704    if (structure.getStructureType().equals(METSStructure.STRUCTURE_TYPE)) {
     705        if (this.pass == 0) {
     706        System.err.println("structure structure");
     707        }
     708        if (index_text) {
     709        while (node != null) {
     710            XPointer.printNode(node, textBuffer, false);
     711            node = XPointer.getNextNode(node);
     712        }
     713        }
     714     
     715        //textBuffer.append(END_OF_SECTION);
     716        //this.sectionSeqNo ++;
     717     
     718    }
     719    return node;
     720    }
     721
     722   
     723    private void printNode(Node node, StringBuffer buffer,
     724               boolean preserve_format, boolean close) {
     725   
     726    if (node.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
     727        if (!close) {
     728        buffer.append(node.getNodeValue());
     729        buffer.append(" ");
     730        }
     731        return;
     732    }
     733   
     734    if (node.getNodeType() == org.w3c.dom.Node.ENTITY_NODE) {
     735       if (!close) {
     736           buffer.append("&");
     737           buffer.append(node.getNodeValue());
     738           buffer.append(";");
     739           buffer.append(" ");
     740       }
     741       return;
     742    }
     743    if (!preserve_format) return; // we don't want any xml output
     744    /*
     745    else if (node.getNodeType() == org.w3c.dom.Node.COMMENT_NODE) {
     746      if (!close) {
     747    buffer.append("<!-- ");
     748    buffer.append(node.getNodeValue());
     749    buffer.append(" -->");
     750      }
     751      }*/
     752    if (node.getNodeType() == org.w3c.dom.Node.ELEMENT_NODE) {
     753       
     754        boolean hasChildren = (node.getChildNodes().getLength() > 0);
     755       
     756        if (close) {
     757        if (hasChildren) {
     758            // put the close node
     759            buffer.append("</");
     760            buffer.append(node.getNodeName());
     761            buffer.append(">");
     762        }
     763        // else it was closed off previously
     764        return;
     765        }
     766       
     767       
     768        buffer.append("<");
     769        buffer.append(node.getNodeName());
     770        NamedNodeMap attributes = ((Element) node).getAttributes();
     771        for (int a = 0; a < attributes.getLength(); a ++) {
     772        Node attributeNode = attributes.item(a);
     773        buffer.append(" ");
     774        buffer.append(attributeNode.getNodeName());
     775       
     776        String value = attributeNode.getNodeValue();
     777        if (value != null && value.length() > 0) {
     778            buffer.append("=\"");
     779            buffer.append(value);
     780            buffer.append("\"");
     781        }
     782        }
     783       
     784
     785        if (!hasChildren) {
     786        buffer.append(" /");
     787        }
     788        buffer.append(">");
     789       
     790        //if (close && node.getNodeName()=="Section") {
     791        //buffer.append((char) 3);
     792        // }
     793    }
     794   
     795    }
     796   
     797    /*
     798    private Node recurseDOMold(DocumentInterface metsDoc, Node node,
     799                AbstractStructure structure, StringBuffer textBuffer,
     800                StringBuffer extraBuffer, String namespace)
     801    //String name, String namespace, String field)
     802    {
     803    List fields = current_index.getFields();
     804    // send out the ctrl-c...if this is
     805    if (structure.getStructureType().equals(METSDivision.DIVISION_TYPE)) {
    606806    // try doing this for all index types
    607     if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {
     807        if ((this.currentIndexName != null)) { // && this.level != null && this.level.equals(IndexerInterface.SECTION_LEVEL)) { //name.startsWith("s")) {
    608808        METSDivision division = (METSDivision) structure;
    609809
     
    718918    return node;
    719919  }
    720 
     920    */
    721921    private String prepareDOM(DocumentInterface metsDoc, Document document, METSStructure structure, String namespace)
    722     //  String name, String namespace, String field)
    723   { StringBuffer extraBuffer = new StringBuffer();
    724     Node node = document.getDocumentElement();
    725     StringBuffer textBuffer = new StringBuffer();
    726 
    727     this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace); //name, namespace, field);
    728     textBuffer.append(extraBuffer.toString());
    729     return textBuffer.toString();
    730   }
    731 
     922    //  String name, String namespace, String field)
     923    {
     924    StringBuffer extraBuffer = new StringBuffer();
     925    Node node = document.getDocumentElement();
     926    StringBuffer textBuffer = new StringBuffer();
     927   
     928    this.recurseDOM(metsDoc, node, structure, textBuffer, extraBuffer, namespace);
     929    textBuffer.append(extraBuffer.toString());
     930    return textBuffer.toString();
     931    }
     932   
    732933}
Note: See TracChangeset for help on using the changeset viewer.