source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/util/HTMLParser.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.applet.*;
4
5public class HTMLParser
6{
7 HTMLDoc document;
8 int pos;
9 int lastpos;
10
11 public HTMLParser(HTMLDoc document)
12 {
13 this.document = document;
14 this.pos = 0;
15 }
16
17 public void startParse()
18 {
19 this.pos = 0;
20 this.lastpos = -1;
21 //System.out.println("Starting "+document.urlString());
22 }
23
24 public int atParse()
25 {
26 return this.pos;
27 }
28
29 public int lastParse()
30 {
31 return this.lastpos;
32 }
33
34 // do a full text/tag parse
35 public String fullParse()
36 {
37 String reply = null;
38 int end;
39 int start;
40
41 this.lastpos = this.pos;
42
43 if (this.pos >= this.document.getContent().length()){
44 return reply;
45 }
46
47 if (this.document.getContent().charAt(this.pos) == '<'){
48 start = this.pos;
49
50 // if we're not at the end of the document,
51 // read the rest of the tag
52 if (this.pos == this.document.getContent().length() - 1){
53 this.pos = this.document.getContent().length();
54 return reply;
55 }
56
57 // if the tag is a comment
58 if (this.pos < this.document.getContent().length() - 3 &&
59 this.document.getContent().substring(this.pos, this.pos+4).equals("<!--")){
60 end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos;
61 reply = this.document.getContent().substring(this.pos, end);
62 this.pos = end;
63 }
64 else
65 { // read up to the end of the tag
66 end = this.pos + 1;
67 while (end < this.document.getContent().length() &&
68 this.document.getContent().charAt(end) != '>'){
69 end ++;
70 }
71
72 // get the whole of the tag into 'reply', and
73 // set the current pos to immediately after the tag
74 if (end < this.document.getContent().length()){
75 reply = this.document.getContent().substring(this.pos, end + 1);
76 this.pos = end + 1;
77 }
78 // patch the trailing > onto the tag string
79 else {
80 reply = this.document.getContent().substring(this.pos, end) + ">";
81 this.pos = end;
82 }
83 }
84 }
85 else
86 { // hunt for the beginning of the next tag
87 start = this.pos;
88 while ((this.pos < this.document.getContent().length()) &&
89 (this.document.getContent().charAt(this.pos) != '<')){
90 this.pos ++;
91 }
92
93 // return everything up to that tag
94 reply = this.document.getContent().substring(start, this.pos);
95 }
96 return reply;
97 }
98
99 // Get the next tag to parse
100 public String nextParse()
101 {
102 String reply = null;
103 int end;
104
105 this.lastpos = this.pos;
106
107 if (this.document.getContent() == null){
108 return null;
109 }
110
111 if (this.pos >= this.document.getContent().length()){
112 return reply;
113 }
114
115 // hunt for the beginning of the next tag
116 while ((this.pos < this.document.getContent().length()) &&
117 (this.document.getContent().charAt(this.pos) != '<')){
118 this.pos ++;
119 }
120
121 // if we're not at the end of the document,
122 // read the rest of the tag
123 if (this.pos < this.document.getContent().length()){
124 if (this.pos == this.document.getContent().length() - 1){
125 this.pos = this.document.getContent().length();
126 return reply;
127 }
128
129 end = this.pos + 1;
130 while (end < this.document.getContent().length() &&
131 this.document.getContent().charAt(end) != '>'){
132 end ++;
133 }
134
135 // get the whole of the tag into 'reply', and
136 // set the current pos to immediately after the tag
137 if (end < this.document.getContent().length()){
138 reply = this.document.getContent().substring(this.pos, end + 1);
139 this.pos = end + 1;
140 }
141 else{
142 this.pos = end;
143 }
144 }
145 return reply;
146 }
147
148
149 /* --
150 -- return next HREF value we come across in the document
151 --
152 -- NB: the checking of the quotes etc is rather lazy - and should be tidied
153 --
154 */
155 public String nextHREF()
156 {
157 String reply;
158 int start, end;
159 boolean quoted;
160 HTMLTag tag;
161
162 reply = this.nextParse();
163 while (this.pos < this.document.getContent().length()){
164 tag = new HTMLTag(reply);
165
166 if (tag.tagName().equals("a")) {
167 start = reply.indexOf("href");
168 if (start == -1) {
169 start = reply.indexOf("HREF");
170 }
171
172 if (start >= 0) {
173 start += 4;
174 quoted = false;
175 while (reply.charAt(start) == ' ' ||
176 reply.charAt(start) == '=' ||
177 reply.charAt(start) == '"') {
178 if (reply.charAt(start) == '"') {
179 quoted = true;
180 }
181 start ++;
182 }
183
184 end = -1;
185 if (quoted) {
186 end = reply.indexOf('"', start+1);
187 }
188 if (end == -1) {
189 end = reply.indexOf(' ', start+1);
190 if (end == -1) {
191 end = reply.length() - 1;
192 }
193 }
194
195 reply = reply.substring(start, end);
196 return reply;
197 }
198 }
199 reply = this.nextParse();
200 }
201 return null;
202 }
203
204 // Return the next link in the page
205 public String nextLink(AppletContext ac)
206 {
207 String reply, reply2;
208 int start, end;
209 boolean quoted;
210 HTMLTag tag;
211
212 if (this.document.getContent() == null){
213 return null;
214 }
215
216 reply = this.nextParse();
217 while (this.pos < this.document.getContent().length()){
218 tag = new HTMLTag(reply);
219 if (tag.tagName().equals("a")){
220 reply = tag.idValue("href");
221 }
222 else if (tag.tagName().equals("frame")){ // image tag
223 reply = tag.idValue("src");
224 }
225 else if (tag.tagName().equals("area")) { // image map area
226 reply = tag.idValue("href");
227 }
228 else if (tag.tagName().equals("frame")){
229 reply = tag.idValue("src");
230 }
231 else {
232 reply = null;
233 }
234
235 if (reply != null){
236 return reply;
237 }
238
239 reply = this.nextParse();
240 }
241 return null;
242 }
243}
Note: See TracBrowser for help on using the repository browser.