source: trunk/gsdl3/src/java/org/greenstone/gsdl3/gs3build/util/HTMLParser.java@ 6700

Last change on this file since 6700 was 5947, checked in by cs025, 21 years ago

Improvements to SQL handling

  • Property svn:keywords set to Author Date Id Revision
File size: 5.4 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.applet.*;
4
5public class HTMLParser
6{ HTMLDoc document;
7 int pos;
8 int lastpos;
9
10 public HTMLParser(HTMLDoc document)
11 { this.document = document;
12 this.pos = 0;
13 }
14
15 public void startParse()
16 { this.pos = 0;
17 this.lastpos = -1;
18// System.out.println("Starting "+document.urlString());
19 }
20
21 public int atParse()
22 { return this.pos;
23 }
24
25 public int lastParse()
26 { return this.lastpos;
27 }
28
29 // do a full text/tag parse
30 public String fullParse()
31 { String reply = null;
32 int end;
33 int start;
34
35 this.lastpos = this.pos;
36
37 if (this.pos >= this.document.getContent().length())
38 { return reply;
39 }
40
41 if (this.document.getContent().charAt(this.pos) == '<')
42 {
43 start = this.pos;
44
45 // if we're not at the end of the document,
46 // read the rest of the tag
47 if (this.pos == this.document.getContent().length() - 1)
48 { this.pos = this.document.getContent().length();
49 return reply;
50 }
51
52 // if the tag is a comment
53 if (this.pos < this.document.getContent().length() - 3 &&
54 this.document.getContent().substring(this.pos, this.pos+4).equals("<!--"))
55 { end = this.document.getContent().substring(this.pos).indexOf("-->") + 3 + this.pos;
56 reply = this.document.getContent().substring(this.pos, end);
57 this.pos = end;
58 }
59 else
60 { // read up to the end of the tag
61 end = this.pos + 1;
62 while (end < this.document.getContent().length() &&
63 this.document.getContent().charAt(end) != '>')
64 { end ++;
65 }
66
67 // get the whole of the tag into 'reply', and
68 // set the current pos to immediately after the tag
69 if (end < this.document.getContent().length())
70 { reply = this.document.getContent().substring(this.pos, end + 1);
71 this.pos = end + 1;
72 }
73 // patch the trailing > onto the tag string
74 else
75 { reply = this.document.getContent().substring(this.pos, end) + ">";
76 this.pos = end;
77 }
78 }
79 }
80 else
81 { // hunt for the beginning of the next tag
82 start = this.pos;
83 while ((this.pos < this.document.getContent().length()) &&
84 (this.document.getContent().charAt(this.pos) != '<'))
85 { this.pos ++;
86 }
87
88 // return everything up to that tag
89 reply = this.document.getContent().substring(start, this.pos);
90 }
91 return reply;
92 }
93
94 // Get the next tag to parse
95 public String nextParse()
96 { String reply = null;
97 int end;
98
99 this.lastpos = this.pos;
100
101 if (this.document.getContent() == null)
102 { return null;
103 }
104
105 if (this.pos >= this.document.getContent().length())
106 { return reply;
107 }
108
109 // hunt for the beginning of the next tag
110 while ((this.pos < this.document.getContent().length()) &&
111 (this.document.getContent().charAt(this.pos) != '<'))
112 { this.pos ++;
113 }
114
115 // if we're not at the end of the document,
116 // read the rest of the tag
117 if (this.pos < this.document.getContent().length())
118 { if (this.pos == this.document.getContent().length() - 1)
119 { this.pos = this.document.getContent().length();
120 return reply;
121 }
122
123 end = this.pos + 1;
124 while ( end < this.document.getContent().length() &&
125 this.document.getContent().charAt(end) != '>')
126 { end ++;
127 }
128
129 // get the whole of the tag into 'reply', and
130 // set the current pos to immediately after the tag
131 if (end < this.document.getContent().length())
132 { reply = this.document.getContent().substring(this.pos, end + 1);
133 this.pos = end + 1;
134 }
135 else
136 { this.pos = end;
137 }
138 }
139 return reply;
140 }
141
142
143 /* --
144 -- return next HREF value we come across in the document
145 --
146 -- NB: the checking of the quotes etc is rather lazy - and should be tidied
147 --
148 */
149 public String nextHREF()
150 { String reply;
151 int start, end;
152 boolean quoted;
153 HTMLTag tag;
154
155 reply = this.nextParse();
156 while (this.pos < this.document.getContent().length())
157 { tag = new HTMLTag(reply);
158
159 if (tag.tagName().equals("a"))
160 { start = reply.indexOf("href");
161 if (start == -1)
162 { start = reply.indexOf("HREF");
163 }
164
165 if (start >= 0)
166 { start += 4;
167 quoted = false;
168 while (reply.charAt(start) == ' ' ||
169 reply.charAt(start) == '=' ||
170 reply.charAt(start) == '"')
171 { if (reply.charAt(start) == '"')
172 { quoted = true;
173 }
174 start ++;
175 }
176
177 end = -1;
178 if (quoted)
179 { end = reply.indexOf('"', start+1);
180 }
181 if (end == -1)
182 { end = reply.indexOf(' ', start+1);
183 if (end == -1)
184 { end = reply.length() - 1;
185 }
186 }
187
188 reply = reply.substring(start, end);
189 return reply;
190 }
191 }
192 reply = this.nextParse();
193 }
194 return null;
195 }
196
197 // Return the next link in the page
198 public String nextLink(AppletContext ac)
199 { String reply, reply2;
200 int start, end;
201 boolean quoted;
202 HTMLTag tag;
203
204 if (this.document.getContent() == null)
205 { return null;
206 }
207
208 reply = this.nextParse();
209 while (this.pos < this.document.getContent().length())
210 { tag = new HTMLTag(reply);
211 if (tag.tagName().equals("a"))
212 { reply = tag.idValue("href");
213 }
214 else if (tag.tagName().equals("frame")) // image tag
215 { reply = tag.idValue("src");
216 }
217 else if (tag.tagName().equals("area")) // image map area
218 { reply = tag.idValue("href");
219 }
220 else if (tag.tagName().equals("frame"))
221 { reply = tag.idValue("src");
222 }
223 else
224 { reply = null;
225 }
226
227 if (reply != null)
228 { return reply;
229 }
230
231 reply = this.nextParse();
232 }
233 return null;
234 }
235}
Note: See TracBrowser for help on using the repository browser.