Context Navigation

← Previous Change
Next Change →

HTMLDocument.java

Timestamp:

2003-11-24T14:26:35+13:00 (20 years ago)

Author:

cs025

Message:

Index document type, metadata extensions

File:

: 1 edited

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java (modified) (1 diff)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java

-              r5800
+              r5944
 public class HTMLDocument extends AbstractDocument
+{
     public static final String HTML_DOCUMENT_TYPE = "HTML";
+  public static final String HTML_DOCUMENT_TYPE = "HTML";
+    /**
+     *  Create the HTMLDocument from a given URL - the URL may in fact be a reference
+     *  to a local file.
+     *
+     *  @param url The location from which to load the file
+     */
+    public HTMLDocument(URL url)
+    {   super(url);
+  public HTMLDocument(DocumentID id) // create the HTMLDocument from a document id
+  { super(id); // the only work done here is the superclass constructor call
+  }
+        HTMLDoc htmlDoc;
+        if (url.toString().startsWith("file://"))
+        {   htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+        }
+        else
+        {   htmlDoc = new HTMLDoc(url);
+        }
+  /**
+   *  Create the HTMLDocument from a given URL - the URL may in fact be a reference
+   *  to a local file.
+   *
+   *  @param url The location from which to load the file
+   */
+  public HTMLDocument(URL url)
+  { super(url);
+        this._extractDocumentFiles(htmlDoc);
+        this._extractDocumentMetadata(htmlDoc);
+    HTMLDoc htmlDoc;
+    if (url.toString().startsWith("file://"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+    }
+    else if (url.toString().startsWith("file:/"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
+    }
+    else
+    { htmlDoc = new HTMLDoc(url);
+    }
+    this._extractDocumentFiles(htmlDoc);
+    this._extractDocumentMetadata(htmlDoc);
+  }
+  private void _extractDocumentMetadata(HTMLDoc htmlDoc) // record "meta" name/content pairs and the "title" text as document metadata
+  { HTMLBlock codedContent = htmlDoc.getCodedContent(); // the elements of the document that are walked in order below
+    boolean inTitle = false; // true while between a "title" tag and its "/title" tag
+    StringBuffer title = new StringBuffer(); // accumulates the non-tag elements seen while inTitle is set
+    for (int e = 0; e < codedContent.size(); e ++)
+    { if (codedContent.elementAt(e) instanceof HTMLTag)
+      { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+        if (tag.tagName().equals("meta"))
+    { // check that the name of the metadata item exists
+      String name = tag.idValue("name");
+      if (name == null || name.length() == 0) {
+        continue; // a meta tag without a usable name is not recorded
+      }
+      // get the value, if it exists
+      String value = tag.idValue("content");
+      if (value != null && value.length() > 0) {
+        System.out.println("  " + value); // NOTE(review): prints the value to standard output - looks like leftover debug output, confirm
+      }
+      // if value does not exist, default it to being the same
+      // as the name.
+      else {
+        value = name;
+      }
+      this.addDocumentMetadata(name, value);
+    }
+    else if (tag.tagName().equals("title"))
+    { inTitle = true;
+    }
+    else if (tag.tagName().equals("/title"))
+    { inTitle = false;
+    }
+    // cut off when real body content appears - not a perfect
+        // implementation, just cheap & cheerful
+    else if (tag.tagName().equals("/head"))
+    { break;
+    }
+    else if (tag.tagName().equals("body"))
+    { break;
+    }
+      }
+      else if (inTitle == true) // non-tag element: collected only while inside the title
+      { title.append(codedContent.elementAt(e).toString());
+      }
+    }
+    if (title.length() > 0) // a "title" entry is added only if some title content was collected
+    { this.addDocumentMetadata("title", title.toString());
+    }
+  }
+    private void _extractDocumentMetadata(HTMLDoc htmlDoc)
+    {   HTMLBlock codedContent = htmlDoc.getCodedContent();
+        boolean inTitle = false;
+        StringBuffer title = new StringBuffer();
+  private void _extractDocumentFiles(HTMLDoc htmlDoc)
+  { URL homeUrl = this.fileSet.getFile(0).getLocation();
+        for (int e = 0; e < codedContent.size(); e ++)
+        {   if (codedContent.elementAt(e) instanceof HTMLTag)
+            {   HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+    HTMLBlock codedContent = htmlDoc.getCodedContent();
+    for (int e = 0; e < codedContent.size(); e ++)
+    { if (codedContent.elementAt(e) instanceof HTMLTag)
+      { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+                if (tag.tagName().equals("meta"))
+                {   // check that the name of the metadata item exists
+                    String name = tag.idValue("name");
+                    if (name == null || name.length() == 0) {
+                        continue;
+                    }
+        if (tag.tagName().equals("img"))
+    { String location = tag.idValue("src");
+                    // get the value, if it exists
+                    String value = tag.idValue("content");
+                    if (value != null && value.length() > 0) {
+                        System.out.println("  " + value);
+                    }
+                    // if value does not exist, default it to being the same
+                    // as the name.
+                    else {
+                        value = name;
+                    }
+                    this.addDocumentMetadata(name, value);
+                }
+                else if (tag.tagName().equals("title"))
+                { inTitle = true;
+                }
+                else if (tag.tagName().equals("/title"))
+                {   inTitle = false;
+                }
+                // cut off when real body content appears - not a perfect
+                // implementation, just cheap & cheerful
+                else if (tag.tagName().equals("/head"))
+                {   break;
+                }
+                else if (tag.tagName().equals("body"))
+                {   break;
+                }
+            }
+            else if (inTitle == true)
+            { title.append(codedContent.elementAt(e).toString());
+            }
+        }
+        if (title.length() > 0)
+        { this.addDocumentMetadata("title", title.toString());
+        }
+    }
+    private void _extractDocumentFiles(HTMLDoc htmlDoc)
+    { URL homeUrl = this.fileSet.getFile(0).getLocation();
+      HTMLBlock codedContent = htmlDoc.getCodedContent();
+      for (int e = 0; e < codedContent.size(); e ++)
+      { if (codedContent.elementAt(e) instanceof HTMLTag)
+    { HTMLTag tag = (HTMLTag) codedContent.elementAt(e);
+      if (tag.tagName().equals("img"))
+      { String location = tag.idValue("src");
+        try
+        { // make the url for the image, and then add it to the document list of
+          // files
+          URL imgUrl = new URL(homeUrl, location);
+          METSFile file = this.fileSet.addFile(imgUrl);
+          this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
+        }
+        catch (MalformedURLException ex)
+        { // TODO: report exception/failure to resolve...
+        }
+      try
+      { // make the url for the image, and then add it to the document list of
+        // files
+        URL imgUrl = new URL(homeUrl, location);
+        METSFile file = this.fileSet.addFile(imgUrl);
+        this.structureSet.getStructure("All").getDivision("All").addFileReference(file.getID());
+      }
+      catch (MalformedURLException ex)
+      { // TODO: report exception/failure to resolve...
+      }
+    }
+      }
+    }
+  }
+    public String getDocumentType()
+    {   return HTML_DOCUMENT_TYPE;
+  public String getDocumentType() // report the type name of this document class
+  { return HTML_DOCUMENT_TYPE; // the class constant, "HTML"
+  }
+  public String getDocumentText()
+  {
+    HTMLDoc htmlDoc;
+    URL     url =(URL) this.fileSet.getFile(0).getLocation();
+    if (url.toString().startsWith("file://"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(7));
+    }
+    public String getDocumentText()
+    { HTMLDoc htmlDoc = new HTMLDoc((URL) this.fileSet.getFile(0).getLocation(), this.fileSet.getFile(0).toString().substring(7));
+      return htmlDoc.getContent();
+    else if (url.toString().startsWith("file:/"))
+    { htmlDoc = new HTMLDoc(url, url.toString().substring(5));
+    }
+    else
+    { htmlDoc = new HTMLDoc(url);
+    }
+    return htmlDoc.getContent();
+  }
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 5944 for trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java

Legend:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/doctypes/HTMLDocument.java

Download in other formats: