Changeset 6284


Ignore:
Timestamp:
2003-12-17T13:08:02+13:00 (20 years ago)
Author:
cs025
Message:

Added HTMLDocumentTools; also modified the abstract interfaces
and the HTMLDocument doctype to support indexing by section.

Location:
trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/AbstractDocument.java

    r6101 r6284  
    1313
    1414import org.greenstone.gsdl3.gs3build.metadata.NamespaceFactory;
     15import org.greenstone.gsdl3.gs3build.metadata.StructureIdentifierFactory;
    1516import org.greenstone.gsdl3.gs3build.metadata.GSDL3Namespace;
    1617import org.greenstone.gsdl3.gs3build.metadata.METSDescriptiveSet;
     
    2223import org.greenstone.gsdl3.gs3build.metadata.METSDivision;
    2324import org.greenstone.gsdl3.gs3build.metadata.METSNamespace;
     25import org.greenstone.gsdl3.gs3build.metadata.MetadataLabel;
    2426
    2527import org.greenstone.gsdl3.gs3build.util.MultiMap;
     
    3941  DocumentID       id;
    4042  boolean          isModified;
     43  StructureIdentifierFactory structureIdFactory;
    4144
    4245  /**
     
    5356    this.structureSet = new METSStructureSet();
    5457    this.id           = id;
     58    this.structureIdFactory = new StructureIdentifierFactory();
    5559  }
    5660 
     
    152156   *   @see DocumentInterface#addDocumentMetadata
    153157   */
    154   public void addDocumentMetadata(String name, String value)
    155   { int colonAt = name.indexOf(":");
    156     String namespace;
    157        
    158     if (colonAt > 0) {
    159       namespace = name.substring(0, colonAt);
    160       name = name.substring(colonAt+1);
    161     }
    162     else {
    163       namespace = GSDL3Namespace.GSDL3_NAMESPACE_ID;
    164     }
    165 
    166     // no need to set isModified, as the following call will do it anyway!
    167     this.addDocumentMetadata(namespace, name, value);
     158  public void addDocumentMetadata(MetadataLabel label, String value)
          // Adds one metadata value to this document.  The namespace and the label
          // are both read from the supplied MetadataLabel and handed, together with
          // the value, to the three-argument addDocumentMetadata overload.
     159  { // no need to set isModified, as the following call will do it anyway!
     160    this.addDocumentMetadata(label.getNamespace(), label.getLabel(), value);
    168161  }
    169162
     
    286279  { this.fileSet = fileSet;
    287280  }
     281
     282  /**
     283   *  This is just a dummy function that does nothing at this level...
     284   */
     285  public org.w3c.dom.Document getDOMDocument()
          // Placeholder: this class has no DOM of its own, so the result is always null.
          // HTMLDocument, which extends this class, supplies a real implementation;
          // callers holding a plain AbstractDocument must be prepared for null.
     286  { return null;
     287  }
    288288 
    289289  /**
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/DocumentInterface.java

    r6101 r6284  
    5757   */
    5858  public String getDocumentText();
     59
     60  /**
     61   *  The document as a dom object
     62   */
     63  public org.w3c.dom.Document getDOMDocument();
    5964
    6065  /**
     
    113118   *  the label.
    114119   *
    115    *  @param <code>String</code> label of the metadata, with a '.' to deliminate
     120   *  @param <code>MetadataLabel</code> label of the metadata, with a '.' to delimit
    116121   *                             sub-component structures.  The label may commence
    117122   *                             with a namespace followed by a colon.
     
    119124   *  @param <code>String</code> value of the metadata
    120125   */
    121   public void addDocumentMetadata(String label, String value);
     126  public void addDocumentMetadata(MetadataLabel label, String value);
    122127 
    123128  /**
     
    132137  public void addDocumentMetadata(String namespace, String label, String value);
    133138
    134  
    135139  /**
    136140   *  Post metadata to a file in this document - the appropriate changes
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java

    r6101 r6284  
    99import java.util.ArrayList;
    1010
    11 import org.greenstone.gsdl3.gs3build.metadata.METSFile;
     11import org.greenstone.gsdl3.gs3build.metadata.*;
    1212import org.greenstone.gsdl3.gs3build.util.*;
     13import org.greenstone.gsdl3.gs3build.xpointer.XPointer;
     14
     15import org.w3c.dom.*;
    1316
    1417public class HTMLDocument extends AbstractDocument
    1518{
    1619  public static final String HTML_DOCUMENT_TYPE = "HTML";
     20  Document domDocument;
    1721
    1822  public HTMLDocument(DocumentID id)
          // Creates a document from its identifier only.  No HTML is parsed here:
          // domDocument starts out null and is filled in later by loadDocument,
          // which getDOMDocument() and getXPointer(String) call when it is still null.
    1923  { super(id);
     24    this.domDocument = null;
    2025  }
    2126
     
    2934  { super(url);
    3035
    31     HTMLDoc htmlDoc;
     36    this.loadDocument(url);
     37
     38    this._extractDocumentFiles();
     39    this._extractDocumentMetadata();
     40
     41    HTMLDocumentTools docTools = new HTMLDocumentTools(this.domDocument);
     42    docTools.setMetsDocument(this);
     43    docTools.setUrl(this.fileSet.getFile(0).getLocation());
     44    docTools.findSections();
     45    METSStructure sectionStruct = docTools.getStructure();
     46
     47    if (sectionStruct.size() > 0) {
     48      METSStructureSet structureSet = this.getDocumentStructure();
     49      structureSet.addStructure(sectionStruct);
     50    }
     51  }
     52
     53  private void loadDocument(URL url)
     54  {
     55    //    HTMLDoc htmlDoc;
     56    HTMLTidy tidyDoc;
    3257    if (url.toString().startsWith("file://"))
    33     { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
     58    { //htmlDoc = new HTMLDoc(url, url.toString().substring(7));
     59      tidyDoc = new HTMLTidy(new File(url.toString().substring(7)));
    3460    }
    3561    else if (url.toString().startsWith("file:/"))
    36     { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
     62    { //htmlDoc = new HTMLDoc(url, url.toString().substring(5));
     63      tidyDoc = new HTMLTidy(new File( url.toString().substring(5)));
    3764    }
    3865    else
    39     { htmlDoc = new HTMLDoc(url);
     66    { //htmlDoc = new HTMLDoc(url);
     67      tidyDoc = new HTMLTidy(url);
    4068    }
    4169   
    42     this._extractDocumentFiles(htmlDoc);
    43     this._extractDocumentMetadata(htmlDoc);
    44   }
    45 
    46   private void _extractDocumentMetadata(HTMLDoc htmlDoc)
    47   { HTMLBlock codedContent = htmlDoc.getCodedContent();
     70    this.domDocument = tidyDoc.getDocument();
     71 }
     72
     73  private void _extractDocumentMetadata()
     74  {
     75    NodeList metadata = this.domDocument.getElementsByTagName("META");
     76    for (int n = 0; n < metadata.getLength(); n ++) {
     77      Node node = metadata.item(n);
     78      Element element = (Element) node;
     79
     80      String name = element.getAttribute("name");
     81      if (name == null || name.length() == 0) {
     82    continue;
     83      }
     84
     85      String value = element.getAttribute("content");
     86      if (value == null || value.length() == 0) {
     87    value = name;
     88      }
     89
     90      this.addDocumentMetadata(new MetadataLabel(name), value);
     91    }
     92   
     93    NodeList titles = this.domDocument.getElementsByTagName("TITLE");
     94    StringBuffer title = new StringBuffer();
     95    for (int n = 0; n < titles.getLength(); n ++) {
     96      Node node = titles.item(n);
     97      Element element = (Element) node;
     98
     99      NodeList childNodes = node.getChildNodes();
     100      for (int c = 0; c < childNodes.getLength(); c ++) {
     101    Node child = childNodes.item(c);
     102    if (child.getNodeType() == org.w3c.dom.Node.TEXT_NODE) {
     103      title.append(child.getNodeValue());
     104    }
     105      }
     106    }
     107
     108    if (title.length() > 0)
     109    { this.addDocumentMetadata(new MetadataLabel("title"), title.toString());
     110    }
     111
     112    /* Old HTMLDoc based parsing...
     113    HTMLBlock codedContent = htmlDoc.getCodedContent();
    48114    boolean inTitle = false;
    49     StringBuffer title = new StringBuffer();
     115    title = new StringBuffer();
    50116
    51117    for (int e = 0; e < codedContent.size(); e ++)
     
    62128      // get the value, if it exists
    63129      String value = tag.idValue("content");
    64       if (value != null && value.length() > 0) {
    65         System.out.println("  " + value);
    66       }
     130
    67131      // if value does not exist, default it to being the same
    68132      // as the name.
    69       else {
     133      if (value == null || value.length() > 0) {
    70134        value = name;
    71135      }
    72 
    73       this.addDocumentMetadata(name, value);
    74     }
    75     else if (tag.tagName().equals("title"))
     136    }
     137    else
     138    if (tag.tagName().equals("title"))
    76139    { inTitle = true;
    77140    }
     
    92155      }
    93156    }
    94     if (title.length() > 0)
    95     { this.addDocumentMetadata("title", title.toString());
    96     }
    97   }
    98 
    99   private void _extractDocumentFiles(HTMLDoc htmlDoc)
     157      */
     158  }
     159
     160  private void _extractDocumentFiles()
    100161  { URL homeUrl = this.fileSet.getFile(0).getLocation();
    101162
     163    NodeList metadata = this.domDocument.getElementsByTagName("img");
     164    for (int n = 0; n < metadata.getLength(); n ++) {
     165      Node node = metadata.item(n);
     166      Element element = (Element) node;
     167
     168      String location = element.getAttribute("src");
     169      if (location == null || location.length() == 0) {
     170    System.out.println("No name");
     171    continue;
     172      }
     173
     174      try
     175      { // make the url for the image, and then add it to the document list of
     176    //
     177    URL imgUrl = new URL(homeUrl, location);
     178    METSFile file = this.fileSet.addFile(imgUrl);
     179    this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
     180      }
     181      catch (MalformedURLException ex)
     182      { // TODO: report exception/failure to resolve...
     183      }
     184    }
     185
     186    /**   
    102187    HTMLBlock codedContent = htmlDoc.getCodedContent();
    103188    for (int e = 0; e < codedContent.size(); e ++)
     
    121206      }
    122207    }
     208    */
    123209  }
    124210
     
    132218    URL     url =(URL) this.fileSet.getFile(0).getLocation();
    133219
     220    this.getSectionText("1");
     221
    134222    if (url.toString().startsWith("file://"))
    135223    { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
     
    144232  }
    145233
    146   public String getSectionText(String document)
    147   { return "";
     234  public Document getDOMDocument()
          // Returns the DOM for this HTML document, building it on first use.
          // When no DOM has been loaded yet, the location of the first file in the
          // file set is passed to loadDocument, which assigns this.domDocument.
          // NOTE(review): whether loadDocument can leave domDocument null (e.g. on a
          // parse failure) is not visible here - confirm before relying on non-null.
     235  {
     236    if (this.domDocument == null) {
     237      URL     url =(URL) this.fileSet.getFile(0).getLocation();
     238      this.loadDocument(url);
     239    }
     240    return this.domDocument;
     241  }
     242
     243  private XPointer getXPointer(METSDivision division)
          // Resolves the XPointer for a structural division of this document.
          // Steps: take the division's default file reference, look up the file
          // group registered under that id, take the first file of the group, and
          // process that file's location against the DOM.
          // Returns null (after printing a message to System.err) when the division
          // has no default file reference, the group is unknown, or the group's
          // first file is null.
          // NOTE(review): unlike getXPointer(String), this overload does not load
          // the DOM when this.domDocument is null; it passes the field through as-is.
          // Callers reaching it via getSectionStartNode(METSDivision) should confirm
          // the DOM has already been loaded.
     244  { String fileId = division.getDefaultFileReference();
     245
     246    if (fileId == null) {
     247      System.err.println("Unable to obtain file reference for section " + division.getLabel());
     248      return null;
     249    }
     250
            // the default file reference is used as the id of a file group
     251    METSFileGroup fileGroup = this.getDocumentFiles().getGroup(fileId);
     252    if (fileGroup == null) {
     253      System.err.println("Unable to obtain file reference for filegroup " + fileId);
     254      return null;
     255    }
     256
            // only the first file of the group is consulted
     257    METSFile file = fileGroup.getFile(0);
     258    if (file == null) {
     259      System.err.println("Unable to obtain any files within filegroup " + fileId);
     260      return null;
     261    }
     262
     263    URL url = file.getLocation();
     264    XPointer xpointer = XPointer.processXPointer(this.domDocument, url);
     265
     266    return xpointer;
     267  }
     268
     269  private XPointer getXPointer(String sectionId)
          // Resolves the XPointer for the section with the given identifier.
          // The DOM is loaded first if it has not been built yet, then the section
          // is looked up as a division of the GSDL3 section structure and handed to
          // getXPointer(METSDivision).
          // Returns null when no such division exists (silently - the diagnostic
          // below is commented out) or when the division overload returns null.
     270  { if (this.domDocument == null) {
     271      URL     url =(URL) this.fileSet.getFile(0).getLocation();
     272      this.loadDocument(url);
     273    }
     274
     275    METSDivision division =
     276      this.getDocumentStructure().getDivision(METSStructureSet.GSDL3_SECTION_STRUCTURE,
     277                          sectionId);
     278    if (division == null) {
     279      //      System.err.println("Unable to locate section " + sectionId);
     280      return null;
     281    }
     282
     283    return this.getXPointer(division);
     284  }
     285 
     286  public Node getSectionStartNode(METSDivision division)
          // Returns the start node of the XPointer resolved for the given division.
          // NOTE(review): getXPointer(METSDivision) returns null when the file
          // reference, file group or file cannot be found; the result is dereferenced
          // below without a check, so that case throws NullPointerException.
          // getSectionText guards against null - consider doing the same here.
     287  { XPointer xpointer = this.getXPointer(division);
     288    return xpointer.getStartNode();
     289  }
     290
     291  public Node getSectionStartNode(String sectionId)
          // Returns the start node of the XPointer resolved for the given section id.
          // NOTE(review): getXPointer(String) returns null for an unknown section id
          // (and whenever the division lookup fails); the result is dereferenced
          // below without a check, so that case throws NullPointerException.
          // getSectionText guards against null - consider doing the same here.
     292  { XPointer xpointer = this.getXPointer(sectionId);
     293
     294    return xpointer.getStartNode();
     295  }
     296
     297  public String getSectionText(String sectionId)
          // Returns the text of one section of this document.
          // The section id is resolved to an XPointer; when it cannot be resolved
          // the result is the empty string rather than null, otherwise it is the
          // XPointer's string form.
     298  { XPointer xpointer = this.getXPointer(sectionId);
     299
            // unknown section, or its file reference could not be resolved
     300    if (xpointer == null) {
     301      return "";
     302    }
     303
     304    // get the XML content of the xpointer...
     305    return xpointer.toString();
    148306  }
    149307}
Note: See TracChangeset for help on using the changeset viewer.