Context Navigation

← Previous Changeset
Next Changeset →

Changeset 8710

Timestamp:

2004-11-30T16:18:58+13:00 (19 years ago)

Author:

chi

Message:

Change program layout

Location:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util

Files:

: 2 edited

HTMLParser.java (modified) (1 diff)
HTMLTidy.java (modified) (2 diffs)

Legend:

: Unmodified
: Added
: Removed

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLParser.java

-              r5947
+              r8710
 public class HTMLParser
+{ HTMLDoc document;
+    int         pos;
+    int         lastpos;
+    public HTMLParser(HTMLDoc document)
+    { this.document = document;
+        this.pos = 0;
+    }
+    public void startParse()
+    { this.pos = 0;
+        this.lastpos = -1;
+//      System.out.println("Starting "+document.urlString());
+    }
+    public int atParse()
+    { return this.pos;
+    }
+    public int lastParse()
+    { return this.lastpos;
+    }
+  // do a full text/tag parse
+  public String fullParse()
+  { String reply = null;
+    int      end;
+    int      start;
+    this.lastpos = this.pos;
+    if (this.pos >= this.document.getContent().length())
+    { return reply;
+    }
+    if (this.document.getContent().charAt(this.pos) == '<')
+    {
+      start = this.pos;
+      // if we're not at the end of the document,
+      // read the rest of the tag
+      if (this.pos == this.document.getContent().length() - 1)
+      { this.pos = this.document.getContent().length();
+        return reply;
+      }
+      // if the tag is a comment
+      if (this.pos < this.document.getContent().length() - 3 &&
+      this.document.getContent().substring(this.pos, this.pos+4).equals("<!--"))
+      { end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos;
+        reply = this.document.getContent().substring(this.pos, end);
+        this.pos = end;
+      }
+      else
+      { // read up to the end of the tag
+    end = this.pos + 1;
+    while (end < this.document.getContent().length() &&
+           this.document.getContent().charAt(end) != '>')
+    { end ++;
+    }
+    // get the whole of the tag into 'reply', and
+    // set the current pos to immediately after the tag
+    if (end < this.document.getContent().length())
+    { reply = this.document.getContent().substring(this.pos, end + 1);
+      this.pos = end + 1;
+    }
+    // patch the trailing > onto the tag string
+{
+    HTMLDoc document;
+    int pos;
+    int lastpos;
+    public HTMLParser(HTMLDoc document)
+    {
+    this.document = document;
+    this.pos = 0;
+    }
+    public void startParse()
+    {
+    this.pos = 0;
+    this.lastpos = -1;
+    //System.out.println("Starting "+document.urlString());
+    }
+    public int atParse()
+    {
+    return this.pos;
+    }
+    public int lastParse()
+    {
+    return this.lastpos;
+    }
+    // do a full text/tag parse
+    public String fullParse()
+    {
+    String reply = null;
+    int end;
+    int start;
+    this.lastpos = this.pos;
+    if (this.pos >= this.document.getContent().length()){
+        return reply;
+    }
+    if (this.document.getContent().charAt(this.pos) == '<'){
+        start = this.pos;
+        // if we're not at the end of the document,
+        // read the rest of the tag
+        if (this.pos == this.document.getContent().length() - 1){
+        this.pos = this.document.getContent().length();
+        return reply;
+        }
+        // if the tag is a comment
+        if (this.pos < this.document.getContent().length() - 3 &&
+        this.document.getContent().substring(this.pos, this.pos+4).equals("<!--")){
+        end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos;
+        reply = this.document.getContent().substring(this.pos, end);
+        this.pos = end;
+        }
+        else
+        { // read up to the end of the tag
+            end = this.pos + 1;
+            while (end < this.document.getContent().length() &&
+               this.document.getContent().charAt(end) != '>'){
+            end ++;
+            }
+            // get the whole of the tag into 'reply', and
+            // set the current pos to immediately after the tag
+            if (end < this.document.getContent().length()){
+            reply = this.document.getContent().substring(this.pos, end + 1);
+            this.pos = end + 1;
+            }
+            // patch the trailing > onto the tag string
+            else {
+            reply = this.document.getContent().substring(this.pos, end) + ">";
+            this.pos = end;
+            }
+        }
+    }
     else
+    { reply = this.document.getContent().substring(this.pos, end) + ">";
+      this.pos = end;
+    }
+      }
+    }
+    else
+    { // hunt for the beginning of the next tag
+      start = this.pos;
+      while ((this.pos < this.document.getContent().length()) &&
+         (this.document.getContent().charAt(this.pos) != '<'))
+      { this.pos ++;
+      }
+      // return everything up to that tag
+      reply = this.document.getContent().substring(start, this.pos);
+    }
+    return reply;
+  }
+    // Get the next tag to parse
+    public String nextParse()
+    { String reply  = null;
+        int      end;
+        this.lastpos    = this.pos;
+        if (this.document.getContent() == null)
+        {   return null;
+        }
+        if (this.pos >= this.document.getContent().length())
+        { return reply;
+        }
+        // hunt for the beginning of the next tag
+        { // hunt for the beginning of the next tag
+        start = this.pos;
         while ((this.pos < this.document.getContent().length()) &&
+                        (this.document.getContent().charAt(this.pos) != '<'))
+        { this.pos ++;
+        }
+        // if we're not at the end of the document,
+        // read the rest of the tag
+        if (this.pos < this.document.getContent().length())
+        { if (this.pos == this.document.getContent().length() - 1)
+            { this.pos = this.document.getContent().length();
+              return reply;
+               (this.document.getContent().charAt(this.pos) != '<')){
+            this.pos ++;
+        }
+        // return everything up to that tag
+        reply = this.document.getContent().substring(start, this.pos);
+        }
+    return reply;
+    }
+    // Get the next tag to parse
+    public String nextParse()
+    {
+    String reply = null;
+    int end;
+    this.lastpos = this.pos;
+    if (this.document.getContent() == null){
+        return null;
+    }
+    if (this.pos >= this.document.getContent().length()){
+        return reply;
+    }
+    // hunt for the beginning of the next tag
+    while ((this.pos < this.document.getContent().length()) &&
+           (this.document.getContent().charAt(this.pos) != '<')){
+        this.pos ++;
+    }
+    // if we're not at the end of the document,
+    // read the rest of the tag
+    if (this.pos < this.document.getContent().length()){
+        if (this.pos == this.document.getContent().length() - 1){
+        this.pos = this.document.getContent().length();
+        return reply;
+        }
+        end = this.pos + 1;
+        while (end < this.document.getContent().length() &&
+            this.document.getContent().charAt(end) != '>'){
+        end ++;
+        }
+        // get the whole of the tag into 'reply', and
+        // set the current pos to immediately after the tag
+        if (end < this.document.getContent().length()){
+        reply = this.document.getContent().substring(this.pos, end + 1);
+        this.pos = end + 1;
+        }
+        else{
+        this.pos = end;
+        }
+    }
+    return reply;
+    }
+    /* --
+       -- return next HREF value we come across in the document
+       --
+       -- NB: the checking of the quotes etc is rather lazy - and should be tidied
+       --
+    */
+    public String nextHREF()
+    {
+    String reply;
+    int start, end;
+    boolean quoted;
+    HTMLTag tag;
+    reply = this.nextParse();
+    while (this.pos < this.document.getContent().length()){
+        tag = new HTMLTag(reply);
+        if (tag.tagName().equals("a")) {
+        start = reply.indexOf("href");
+        if (start == -1) {
+            start = reply.indexOf("HREF");
+        }
+        if (start >= 0) {
+            start += 4;
+            quoted = false;
+            while (reply.charAt(start) == ' ' ||
+               reply.charAt(start) == '=' ||
+               reply.charAt(start) == '"') {
+            if (reply.charAt(start) == '"') {
+                quoted = true;
+            }
+          end = this.pos + 1;
+            while ( end < this.document.getContent().length() &&
+                    this.document.getContent().charAt(end) != '>')
+            { end ++;
+            start ++;
+            }
+            end = -1;
+            if (quoted) {
+            end = reply.indexOf('"', start+1);
+            }
+            if (end == -1) {
+            end = reply.indexOf(' ', start+1);
+            if (end == -1) {
+                end = reply.length() - 1;
+            }
+            // get the whole of the tag into 'reply', and
+            // set the current pos to immediately after the tag
+            if (end < this.document.getContent().length())
+            { reply = this.document.getContent().substring(this.pos, end + 1);
+                this.pos = end + 1;
+            }
+            else
+            { this.pos = end;
+            }
+        }
+            }
+            reply = reply.substring(start, end);
+            return reply;
+        }
+        }
+        reply = this.nextParse();
+    }
+    return null;
+    }
+    // Return the next link in the page
+    public String nextLink(AppletContext ac)
+    {
+    String reply, reply2;
+    int start, end;
+    boolean quoted;
+    HTMLTag tag;
+    if (this.document.getContent() == null){
+        return null;
+    }
+    reply = this.nextParse();
+    while (this.pos < this.document.getContent().length()){
+        tag = new HTMLTag(reply);
+        if (tag.tagName().equals("a")){
+        reply = tag.idValue("href");
+        }
+        else if (tag.tagName().equals("frame")){ // image tag
+        reply = tag.idValue("src");
+        }
+        else if (tag.tagName().equals("area")) { // image map area
+        reply = tag.idValue("href");
+        }
+        else if (tag.tagName().equals("frame")){
+        reply = tag.idValue("src");
+        }
+        else {
+        reply   = null;
+        }
+        if (reply != null){
         return reply;
+    }
+    /* --
+         -- return next HREF value we come across in the document
+         --
+         -- NB: the checking of the quotes etc is rather lazy - and should be tidied
+         --
+    */
+    public String nextHREF()
+    { String reply;
+        int         start, end;
+        boolean quoted;
+        HTMLTag tag;
+        reply = this.nextParse();
+        while (this.pos < this.document.getContent().length())
+        { tag = new HTMLTag(reply);
+            if (tag.tagName().equals("a"))
+            { start = reply.indexOf("href");
+                if (start == -1)
+                { start = reply.indexOf("HREF");
+                }
+                if (start >= 0)
+                { start += 4;
+                    quoted = false;
+                    while   (reply.charAt(start) == ' ' ||
+                                 reply.charAt(start) == '=' ||
+                                 reply.charAt(start) == '"')
+                    { if (reply.charAt(start) == '"')
+                        { quoted = true;
+                        }
+                        start ++;
+                    }
+                    end = -1;
+                    if (quoted)
+                    { end = reply.indexOf('"', start+1);
+                    }
+                    if (end == -1)
+                    { end = reply.indexOf(' ', start+1);
+                        if (end == -1)
+                        { end = reply.length() - 1;
+                        }
+                    }
+                    reply = reply.substring(start, end);
+                  return reply;
+                }
+            }
+            reply = this.nextParse();
+        }
+        return null;
+    }
+    // Return the next link in the page
+    public String nextLink(AppletContext ac)
+    { String reply, reply2;
+        int         start, end;
+        boolean quoted;
+        HTMLTag tag;
+        if (this.document.getContent() == null)
+        {   return null;
+        }
+        reply = this.nextParse();
+        while (this.pos < this.document.getContent().length())
+        { tag = new HTMLTag(reply);
+            if (tag.tagName().equals("a"))
+            { reply = tag.idValue("href");
+            }
+            else if (tag.tagName().equals("frame")) // image tag
+            { reply = tag.idValue("src");
+            }
+            else if (tag.tagName().equals("area")) // image map area
+            { reply = tag.idValue("href");
+            }
+            else if (tag.tagName().equals("frame"))
+            {   reply = tag.idValue("src");
+            }
+            else
+            {   reply   = null;
+            }
+            if (reply != null)
+            {   return reply;
+            }
+            reply = this.nextParse();
+        }
+        return null;
+    }
+        }
+        reply = this.nextParse();
+    }
+    return null;
+    }
+}

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java

-              r8705
+              r8710
+    {
         String comment = new String(ch, start, length);
         parent.comment(comment);
+    }
 …
         tagName.equals("link") ||
         tagName.equals("base") ||
         tagName.equals("img") ||
+        //tagName.equals("img") ||
         tagName.equals("hr")) {
         isSingleton = true;

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 8710

Legend:

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLParser.java

trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLTidy.java

Download in other formats: