Changeset 6453


Ignore:
Timestamp:
2004-01-12T15:55:11+13:00 (20 years ago)
Author:
cs025
Message:

Extended extractor manager slightly - implemented metadata.xml support

Location:
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java

    r6013 r6453  
    99public class ExtractorManager
    1010{
     11  public static final String ACCUMULATE_MODE = "accumulate";
     12
    1113  DocumentList          documents;
    1214  ExtractorInterface [] list;
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java

    r6289 r6453  
    5454
    5555    String mode = attributes.getValue("mode");
    56     this.accumulate = mode.equals(ACCUMULATE_MODE);
     56    this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);   
    5757      }
    5858    }
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java

    r6289 r6453  
    1010import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
    1111
     12import org.apache.xerces.parsers.SAXParser;
    1213import org.xml.sax.XMLReader;
    1314import org.xml.sax.InputSource;
     
    1920import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
    2021import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
    21 import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
     22import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument;
    2223import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
    2324import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
     
    2728public class MetaXMLExtractor implements ExtractorInterface
    2829{
    29   class IndexHandlerException extends Exception
    30   { public IndexHandlerException(String value)
    31     { super(value);
    32     }
    33   }
    34 
    35   /**
    36    *  An inner class to handle GML files
    37    */
    38   class IndexHandler extends GS2TextFileHandler
    39   { List    labels;
    40     URL     base;
    41 
    42     IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
    43     {
    44       super(content);
    45 
    46       this.labels  = new ArrayList();
    47       this.base    = url;
    48 
    49       String parentDir;
    50       int leaf = this.base.toString().lastIndexOf('/');
    51       if (leaf >= 0) {
    52     parentDir = this.base.toString().substring(0, leaf+1);
    53       }
    54       else {
    55     parentDir = this.base.toString();
    56       }
    57 
    58       // get the first line
    59       this.getLine();
    60 
    61       if (!this.hasMore())
    62       { throw new IndexHandlerException("No title line");
    63       }
    64      
    65       // get the first totem - it should be "key:"
    66       String entry = this.getEntry(true);
    67      
    68       // now get all the labels
    69       while (this.hasMore())
    70       { String label = this.getEntry(true);
    71         if (label == null || label.length() == 0) {
    72       continue;
    73     }
     30  /**
     31   *  An inner class to handle Metadata files
     32   */
     33  class MetadataHandler extends DefaultHandler
     34  { List         files;
     35    String       label;
     36    StringBuffer value;
     37    URL          url;
     38    boolean      inElement;
     39    boolean      accumulate;
     40    DocumentList documentList;
     41
     42    MetadataHandler(DocumentList documentList)
     43    { super();
     44   
     45      this.label = null;
     46      this.value = null;
     47      this.documentList = documentList;
     48    }
     49
     50    public void startElement(String URI, String localName, String qName, Attributes attributes)
     51    { if (localName.equals("FileName"))
     52      { this.value = new StringBuffer();
     53      }
     54      else if (localName.equals("FileSet"))
     55      { this.files = new ArrayList();
     56      }
     57      else if (localName.equals("Description"))
     58      {
     59      }
     60      else if (localName.equals("Metadata"))
     61      { this.label = attributes.getValue("name");
     62        this.value = new StringBuffer();
     63
     64    String mode = attributes.getValue("mode");
     65    this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);   
     66      }
     67    }
     68
     69    public void endElement(String URI, String localName, String qName)
     70    { if (localName.equals("FileName"))
     71      { String file = this.value.toString();
     72        this.value = null;
     73    this.files.add(file);
     74      }
     75      else if (localName.equals("FileSet"))
     76      { // post the existing files item...
     77      }
     78      else if (localName.equals("Description"))
     79      {
     80      }
     81      else if (localName.equals("Metadata"))
     82      { List documentIds;
     83
     84        documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString());
    7485   
    75     this.labels.add(label);
    76     System.out.println("Adding label: " + label);
    77       }
    78 
    79       while (this.hasMoreLines()) {
    80     this.getLine();
    81 
    82     // Get the file pattern itself
    83     String filePattern = this.getEntry(true);
    84     if (filePattern == null || filePattern.length() == 0) {
    85       continue;
    86     }
    87 
    88     // get a list of documents that match the file pattern
    89     List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
     86    MetaXMLExtractor.postMetadata(this.url, this.files,
     87                      this.label, this.value.toString(),
     88                      this.accumulate);
    9089    if (documentIds != null) {
    9190      Iterator iterator = documentIds.iterator();
     
    9594    }
    9695
    97     // if no files match this data, then skip this row
    98     // TODO: raise a quality error message
    99     if (documentIds == null || documentIds.size() == 0) {
    100       continue;
    101     }
    102 
    103     // cache up the documents that match for speed improvements...
    104     List documents = new ArrayList();
    105     Iterator idIterator = documentIds.iterator();
    106     while (idIterator.hasNext()) {
    107       String docIdString = idIterator.next().toString();
    108       System.out.println(docIdString);
    109       DocumentID docId   = new DocumentID(docIdString);
    110       DocumentInterface document = documentList.getDocument(docId);
    111       if (document != null) {
    112         documents.add(document);
     96    if (documentIds != null && documentIds.size() > 0) {
     97      List documents = new ArrayList();
     98     
     99      Iterator idIterator = documentIds.iterator();
     100      while (idIterator.hasNext()) {
     101        String docIdString = idIterator.next().toString();
     102        DocumentID docId   = new DocumentID(docIdString);
     103        DocumentInterface document = documentList.getDocument(docId);
     104        if (document != null) {
     105          documents.add(document);
     106        }
     107      }
     108
     109      Iterator docIterator = documents.iterator();
     110      while (docIterator.hasNext()) {
     111        DocumentInterface document = (DocumentInterface) docIterator.next();
     112       
     113        // Post to document
     114        // TODO: tailor this to posting documents to *sections* as required...
     115        document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
    113116      }
    114117    }
    115118
    116     // Next, split the row into the separate metadata items
    117     int entryNo = 0;
    118     while (this.hasMore()) {
    119       String item = this.getEntry(true);
    120       if (item == null || item.length() == 0) {
    121         entryNo ++;
    122         continue;
    123       }
    124 
    125       String label = null;
    126       if (item.startsWith("<")) {
    127         int labelEnd = item.indexOf('>');
    128         if (labelEnd >= 0) {
    129           label = item.substring(1, labelEnd);
    130 
    131           item = item.substring(labelEnd+1, item.length());
    132 
    133           // eliminate any weird whitespace
    134           item.trim();
    135 
    136           // cope with a solo 'item' label with no following string
    137           if (item.length() == 0) {
    138         entryNo ++;
    139         continue;
    140           }
    141         }
    142         // starts with a bracketed label
    143       }
    144       else if (entryNo < this.labels.size()) {
    145         label = (String) this.labels.get(entryNo);
    146       }
    147 
    148       // Actually post the metadata -
    149       // it may be good to have cached all the documents that we're going to change
    150       // in order to minimise rewrites...
    151       if (label != null) {
    152         Iterator docIterator = documents.iterator();
    153         while (docIterator.hasNext()) {
    154           DocumentInterface document = (DocumentInterface) docIterator.next();
    155 
    156           // Post to document
    157           // TODO: tailor this to posting documents to *sections* as required...
    158           document.addDocumentMetadata(new MetadataLabel(label), item);
    159           System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
    160         }
    161       }
    162       entryNo ++;
    163     }
    164    
    165     // write out the modified documents
    166     // TODO: nicer/more generalised interface for this and related activity in
    167     //       extractor manager (actually, enricher manager);
    168     Iterator docIterator = documents.iterator();
    169     while (docIterator.hasNext()) {
    170       DocumentInterface document = (DocumentInterface) docIterator.next();
    171 
    172       documentList.modifiedDocument(document);
    173     }
    174       }
    175     }
    176 
     119    // flatten the metadata items again...
     120        this.value = null;
     121    this.label = null;
     122      }
     123    }
     124
     125    public void characters(char c[], int start, int length)
     126    { if (this.value != null)
     127      { String string = new String(c, start, length);
     128        this.value.append(string);
     129      }
     130    }
     131
     132    public void setUrl(URL url)
     133    { this.url = url;
     134    }
    177135  }
    178136
     
    207165
    208166  /**
    209    *  Process the document - for a GML document, this results in the
     167   *  Process the document - for a metadata document, this results in the
    210168   *  decoration of other files, for other documents, it does nothing.
    211169   */
    212170  public void extractDocument(DocumentID docID, DocumentInterface document)
    213   { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
    214     { // Extract the content from the index file
    215      
    216       // get the file
    217       String documentText =
    218     DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
    219    
    220       if (documentText == null) {
    221     System.err.println("MetaXMLExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
     171  { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE))
     172    { // Extract the content from the metadata file
     173      URL url;
     174
     175      try {
     176    SAXParser parser = new SAXParser();
     177    MetadataHandler handler = new MetadataHandler(this.documentList);
     178        /*
     179    XMLReader reader = XMLReaderFactory.createXMLReader();
     180    reader.setContentHandler(handler);
     181    reader.setErrorHandler(handler);*/
     182    parser.setContentHandler(handler);
     183     
     184    // Get path of file; we cheat here by assuming that the url is a file - this
     185    // really ought to be done better [TODO: fix to handle full paths & URLs]
     186    url = document.getDocumentFiles().getFile(0).getURL();
     187    String filePath = url.getPath();
     188    handler.setUrl(new URL(url, "."));
     189
     190    // A metadata document consists of one file only - get it from the 'default'
     191    // file group
     192    /*
     193    FileReader fileReader = new FileReader(filePath);
     194    reader.parse(new InputSource(fileReader));
     195    */
     196    parser.parse(filePath);
     197      }
     198      catch (SAXException saxException)
     199      { // TODO: log error
     200    System.err.println(saxException);
     201      }
     202      catch (java.io.FileNotFoundException fileException)
     203      { System.err.println(fileException);
     204      }
     205      catch (java.io.IOException ioException)
     206      { System.err.println(ioException);
     207      }
     208      /*      catch (java.net.MalformedURLException malEx) {
     209    System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction.");
    222210    return;
    223211      }
    224 
    225       try {
    226     IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
    227       }
    228       catch (IndexHandlerException ex) {
    229       }
    230      
     212      */
     213
    231214      // for each document post it to the corresponding document
    232215    }
    233216  }
    234217
    235   protected static void postMetadata(String file, String value, String label)
    236   {
     218  protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate)
     219  { String file;
     220
     221    Iterator fileIter = files.iterator();
     222    while (fileIter.hasNext()) {
     223      file = fileIter.next().toString();
     224
     225      System.out.println(url.toString() + " " + file + ": " + label + "=" + value);
     226    }
    237227  }
    238228
     
    254244  }
    255245}
     246
     247
     248
     249
Note: See TracChangeset for help on using the changeset viewer.