Context Navigation

← Previous Changeset
Next Changeset →

Changeset 6453

Timestamp:

2004-01-12T15:55:11+13:00 (20 years ago)

Author:

cs025

Message:

Extended extractor manager slightly - implemented metadata.xml support

Location:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor

Files:

: 3 edited

ExtractorManager.java (modified) (1 diff)
GMLExtractor.java (modified) (1 diff)
MetaXMLExtractor.java (modified) (6 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/ExtractorManager.java

r6013	r6453
9	9	public class ExtractorManager
10	10	{
	11	public static final String ACCUMULATE_MODE = "accumulate";
	12
11	13	DocumentList documents;
12	14	ExtractorInterface [] list;

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/GMLExtractor.java

r6289	r6453
54	54
55	55	String mode = attributes.getValue("mode");
56		this.accumulate = mode.equals(ACCUMULATE_MODE);
	56	this.accumulate = mode.equals(ExtractorManager.ACCUMULATE_MODE);
57	57	}
58	58	}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/extractor/MetaXMLExtractor.java

-              r6289
+              r6453
 import org.greenstone.gsdl3.gs3build.util.GS2TextFileHandler;
+import org.apache.xerces.parsers.SAXParser;
 import org.xml.sax.XMLReader;
 import org.xml.sax.InputSource;
 …
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentID;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentInterface;
 import org.greenstone.gsdl3.gs3build.doctypes.IndexDocument;
+import org.greenstone.gsdl3.gs3build.doctypes.MetadataDocument;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentLoader;
 import org.greenstone.gsdl3.gs3build.doctypes.DocumentList;
 …
 public class MetaXMLExtractor implements ExtractorInterface
+{
+  class IndexHandlerException extends Exception
+  { public IndexHandlerException(String value)
+    { super(value);
+    }
+  }
+  /**
+   *  An inner class to handle index files
+   */
+  class IndexHandler extends GS2TextFileHandler
+  { List    labels;
+    URL     base;
+    IndexHandler(String content, URL url, DocumentList documentList) throws IndexHandlerException
+    {
+      super(content);
+      this.labels  = new ArrayList();
+      this.base    = url;
+      String parentDir;
+      int leaf = this.base.toString().lastIndexOf('/');
+      if (leaf >= 0) {
+    parentDir = this.base.toString().substring(0, leaf+1);
+      }
+      else {
+    parentDir = this.base.toString();
+      }
+      // get the first line
+      this.getLine();
+      if (!this.hasMore())
+      { throw new IndexHandlerException("No title line");
+      }
+      // get the first totem - it should be "key:"
+      String entry = this.getEntry(true);
+      // now get all the labels
+      while (this.hasMore())
+      { String label = this.getEntry(true);
+        if (label == null || label.length() == 0) {
+      continue;
+    }
+  /**
+   *  An inner class to handle Metadata files
+   */
+  class MetadataHandler extends DefaultHandler
+  { List         files;
+    String       label;
+    StringBuffer value;
+    URL          url;
+    boolean      inElement;
+    boolean      accumulate;
+    DocumentList documentList;
+    /**
+     *  Create a handler with no pending metadata label or value.
+     *
+     *  @param documentList  the document list kept for the document
+     *                       lookups made while handling elements
+     */
+    MetadataHandler(DocumentList documentList)
+    {
+      super();
+      this.documentList = documentList;
+      this.value = null;
+      this.label = null;
+    }
+    /**
+     *  SAX callback for an opening tag.
+     *
+     *  "FileName" opens a fresh text buffer, "FileSet" starts a new list of
+     *  files, "Description" needs no set-up, and "Metadata" records the
+     *  label (its "name" attribute), opens a fresh text buffer and notes
+     *  whether the "mode" attribute asks for accumulation.
+     *
+     *  @param URI         namespace URI of the element (unused)
+     *  @param localName   local name of the element, used for dispatch
+     *  @param qName       qualified name of the element (unused)
+     *  @param attributes  attributes attached to the element
+     */
+    public void startElement(String URI, String localName, String qName, Attributes attributes)
+    { if (localName.equals("FileName"))
+      { this.value = new StringBuffer();
+      }
+      else if (localName.equals("FileSet"))
+      { this.files = new ArrayList();
+      }
+      else if (localName.equals("Description"))
+      { // nothing to prepare for a description block
+      }
+      else if (localName.equals("Metadata"))
+      { this.label = attributes.getValue("name");
+        this.value = new StringBuffer();
+        // getValue() yields null when the element carries no "mode"
+        // attribute; comparing from the constant's side treats that case
+        // as "do not accumulate" instead of throwing a NullPointerException.
+        String mode = attributes.getValue("mode");
+        this.accumulate = ExtractorManager.ACCUMULATE_MODE.equals(mode);
+      }
+    }
+    public void endElement(String URI, String localName, String qName)
+    { if (localName.equals("FileName"))
+      { String file = this.value.toString();
+        this.value = null;
+    this.files.add(file);
+      }
+      else if (localName.equals("FileSet"))
+      { // post the existing files item...
+      }
+      else if (localName.equals("Description"))
+      {
+      }
+      else if (localName.equals("Metadata"))
+      { List documentIds;
+        documentIds = this.documentList.findDocumentIdsUsingFiles(this.files, this.url.toString());
+    this.labels.add(label);
+    System.out.println("Adding label: " + label);
+      }
+      while (this.hasMoreLines()) {
+    this.getLine();
+    // Get the file pattern itself
+    String filePattern = this.getEntry(true);
+    if (filePattern == null || filePattern.length() == 0) {
+      continue;
+    }
+    // get a list of documents that match the file pattern
+    List documentIds = documentList.findDocumentIdsUsingFile(filePattern);
+    MetaXMLExtractor.postMetadata(this.url, this.files,
+                      this.label, this.value.toString(),
+                      this.accumulate);
     if (documentIds != null) {
       Iterator iterator = documentIds.iterator();
 …
+    }
+    // if no files match this data, then skip this row
+    // TODO: raise a quality error message
+    if (documentIds == null || documentIds.size() == 0) {
+      continue;
+    }
+    // cache up the documents that match for speed improvements...
+    List documents = new ArrayList();
+    Iterator idIterator = documentIds.iterator();
+    while (idIterator.hasNext()) {
+      String docIdString = idIterator.next().toString();
+      System.out.println(docIdString);
+      DocumentID docId   = new DocumentID(docIdString);
+      DocumentInterface document = documentList.getDocument(docId);
+      if (document != null) {
+        documents.add(document);
+    if (documentIds != null && documentIds.size() > 0) {
+      List documents = new ArrayList();
+      Iterator idIterator = documentIds.iterator();
+      while (idIterator.hasNext()) {
+        String docIdString = idIterator.next().toString();
+        DocumentID docId   = new DocumentID(docIdString);
+        DocumentInterface document = documentList.getDocument(docId);
+        if (document != null) {
+          documents.add(document);
+        }
+      }
+      Iterator docIterator = documents.iterator();
+      while (docIterator.hasNext()) {
+        DocumentInterface document = (DocumentInterface) docIterator.next();
+        // Post to document
+        // TODO: tailor this to posting documents to *sections* as required...
+        document.addDocumentMetadata(new MetadataLabel(this.label), this.value.toString());
+      }
+    }
+    // Next, split the row into the separate metadata items
+    int entryNo = 0;
+    while (this.hasMore()) {
+      String item = this.getEntry(true);
+      if (item == null || item.length() == 0) {
+        entryNo ++;
+        continue;
+      }
+      String label = null;
+      if (item.startsWith("<")) {
+        int labelEnd = item.indexOf('>');
+        if (labelEnd >= 0) {
+          label = item.substring(1, labelEnd);
+          item = item.substring(labelEnd+1, item.length());
+          // eliminate any weird whitespace
+          item.trim();
+          // cope with a solo 'item' label with no following string
+          if (item.length() == 0) {
+        entryNo ++;
+        continue;
+          }
+        }
+        // starts with a bracketed label
+      }
+      else if (entryNo < this.labels.size()) {
+        label = (String) this.labels.get(entryNo);
+      }
+      // Actually post the metadata -
+      // it may be good to have cached all the documents that we're going to change
+      // in order to minimise rewrites...
+      if (label != null) {
+        Iterator docIterator = documents.iterator();
+        while (docIterator.hasNext()) {
+          DocumentInterface document = (DocumentInterface) docIterator.next();
+          // Post to document
+          // TODO: tailor this to posting documents to *sections* as required...
+          document.addDocumentMetadata(new MetadataLabel(label), item);
+          System.out.println("Posting " + label + "=" + item + " to " + parentDir + filePattern);
+        }
+      }
+      entryNo ++;
+    }
+    // write out the modified documents
+    // TODO: nicer/more generalised interface for this and related activity in
+    //       extractor manager (actually, enricher manager);
+    Iterator docIterator = documents.iterator();
+    while (docIterator.hasNext()) {
+      DocumentInterface document = (DocumentInterface) docIterator.next();
+      documentList.modifiedDocument(document);
+    }
+      }
+    }
+    // flatten the metadata items again...
+        this.value = null;
+    this.label = null;
+      }
+    }
+    public void characters(char c[], int start, int length)
+    { if (this.value != null)
+      { String string = new String(c, start, length);
+        this.value.append(string);
+      }
+    }
+    public void setUrl(URL url)
+    { this.url = url;
+    }
+  }
 …
   /**
    *  Process the document - for a GML document, this results in the
+   *  Process the document - for a metadata document, this results in the
    *  decoration of other files, for other documents, it does nothing.
    */
   public void extractDocument(DocumentID docID, DocumentInterface document)
+  { if (document.getDocumentType().equals(IndexDocument.INDEX_DOCUMENT_TYPE))
+    { // Extract the content from the index file
+      // get the file
+      String documentText =
+    DocumentLoader.getAsString(document.getDocumentFiles().getFile(0).getURL());
+      if (documentText == null) {
+    System.err.println("MetaXMLExtractor: Unable to load any content for " + document.getDocumentFiles().getFile(0).toString());
+  { if (document.getDocumentType().equals(MetadataDocument.METADATA_DOCUMENT_TYPE))
+    { // Extract the content from the metadata file
+      URL url;
+      try {
+    SAXParser parser = new SAXParser();
+    MetadataHandler handler = new MetadataHandler(this.documentList);
+        /*
+    XMLReader reader = XMLReaderFactory.createXMLReader();
+    reader.setContentHandler(handler);
+    reader.setErrorHandler(handler);*/
+    parser.setContentHandler(handler);
+    // Get path of file; we cheat here by assuming that the url is a file - this
+    // really ought to be done better [TODO: fix to handle full paths & URLs]
+    url = document.getDocumentFiles().getFile(0).getURL();
+    String filePath = url.getPath();
+    handler.setUrl(new URL(url, "."));
+    // A metadata document consists of one file only - get it from the 'default'
+    // file group
+    /*
+    FileReader fileReader = new FileReader(filePath);
+    reader.parse(new InputSource(fileReader));
+    */
+    parser.parse(filePath);
+      }
+      catch (SAXException saxException)
+      { // TODO: log error
+    System.err.println(saxException);
+      }
+      catch (java.io.FileNotFoundException fileException)
+      { System.err.println(fileException);
+      }
+      catch (java.io.IOException ioException)
+      { System.err.println(ioException);
+      }
+      /*      catch (java.net.MalformedURLException malEx) {
+    System.err.println("Unable to get parent of URL "+url.toString()+" in metadata extraction.");
     return;
+      }
+      try {
+    IndexHandler handler = new IndexHandler(documentText, document.getDocumentFiles().getFile(0).getURL(), this.documentList);
+      }
+      catch (IndexHandlerException ex) {
+      }
+      */
       // for each document post it to the corresponding document
+    }
+  }
+  protected static void postMetadata(String file, String value, String label)
+  {
+  /**
+   *  Print one line to standard output for every entry of the file list,
+   *  showing the url, the file, and the label=value pair.
+   *
+   *  @param url         the URL printed at the start of each line
+   *  @param files       the list whose entries are printed one per line
+   *  @param label       the metadata label
+   *  @param value       the metadata value
+   *  @param accumulate  not used by this method at present
+   */
+  protected static void postMetadata(URL url, List files, String label, String value, boolean accumulate)
+  {
+    for (Iterator fileIter = files.iterator(); fileIter.hasNext(); ) {
+      String file = fileIter.next().toString();
+      System.out.println(url.toString() + " " + file + ": " + label + "=" + value);
+    }
+  }
 …
+  }
+}

Note: See TracChangeset for help on using the changeset viewer.

Download in other formats: