1 | package org.greenstone.gsdl3.gs3build.util;
|
---|
2 |
|
---|
3 | import java.util.*;
|
---|
4 |
|
---|
5 | public class HTMLBlockList
|
---|
6 | { Vector blocks;
|
---|
7 | String title;
|
---|
8 | protected static HTMLTagVector anchorStartTags;
|
---|
9 | protected static HTMLTagVector anchorEndTags;
|
---|
10 | protected static HTMLTagVector anchorOptEndTags;
|
---|
11 | protected static HTMLTagVector headingStartTags;
|
---|
12 | protected static HTMLTagVector headingEndTags;
|
---|
13 |
|
---|
14 | static
|
---|
15 | { anchorStartTags = new HTMLTagVector("a");
|
---|
16 | anchorEndTags = new HTMLTagVector("a,h1,h2,h3,h4,h5,h6,table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
|
---|
17 | anchorOptEndTags = new HTMLTagVector("div,center,form");
|
---|
18 | headingStartTags = new HTMLTagVector("h1,h2,h3,h4,h5,h6");
|
---|
19 | headingEndTags = new HTMLTagVector("h1,h2,h3,h4,h5,h6,table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
|
---|
20 | }
|
---|
21 |
|
---|
22 | // opening opentag incontent result
|
---|
23 | // T T T * {if opening, incontent must be false}
|
---|
24 | // T T F T
|
---|
25 | // T F T * {if opening, incontent must be false}
|
---|
26 | // T F F F
|
---|
27 | // F T T T
|
---|
28 | // F T F F {if getting an opening tag
|
---|
29 | // F F T T
|
---|
30 | // F F F T
|
---|
31 | private boolean inTagList(HTMLTag tag, Vector tags, boolean opening)
|
---|
32 | { int tagref;
|
---|
33 |
|
---|
34 | for (tagref = 0; tagref < tags.size(); tagref ++)
|
---|
35 | { if ((tag.xtagName().equals((String) tags.elementAt(tagref))) &&
|
---|
36 | (tag.tagIsOpening() || !opening))
|
---|
37 | { return true;
|
---|
38 | }
|
---|
39 | }
|
---|
40 | return false;
|
---|
41 | }
|
---|
42 |
|
---|
43 | //
|
---|
44 | // Common initialisation; parse the document for text surrounded by
|
---|
45 | // pairs of <starttag>....<endtag/starttag> combos
|
---|
46 | //
|
---|
47 | protected void initialise(HTMLDoc doc, HTMLTagVector starttags, HTMLTagVector endtags, HTMLTagVector optendtags)
|
---|
48 | { boolean intag;
|
---|
49 | boolean incontent;
|
---|
50 | HTMLCText text;
|
---|
51 | HTMLTag tag;
|
---|
52 | HTMLTag starttag;
|
---|
53 | HTMLBlock block;
|
---|
54 | HTMLBlock docBlock;
|
---|
55 | Enumeration elements;
|
---|
56 | Object element;
|
---|
57 |
|
---|
58 | this.blocks = new Vector(1);
|
---|
59 | block = null;
|
---|
60 | intag = false;
|
---|
61 | incontent = false;
|
---|
62 | starttag = null;
|
---|
63 |
|
---|
64 | docBlock = doc.getCodedContent();
|
---|
65 | elements = docBlock.elements();
|
---|
66 |
|
---|
67 | while (elements.hasMoreElements())
|
---|
68 | { element = elements.nextElement();
|
---|
69 |
|
---|
70 | // if it's a tag
|
---|
71 | if (element instanceof HTMLTag)
|
---|
72 | { tag = (HTMLTag) element;
|
---|
73 |
|
---|
74 | // end the fragment if getting a closing tag
|
---|
75 | if (intag)
|
---|
76 | { if (endtags.matches(tag, false) ||
|
---|
77 | (incontent && optendtags.matches(tag, false)))
|
---|
78 | { block.addTag(starttag.endTag());
|
---|
79 | intag = false;
|
---|
80 | incontent = false;
|
---|
81 | }
|
---|
82 | else
|
---|
83 | { block.addTag(tag);
|
---|
84 | }
|
---|
85 | }
|
---|
86 |
|
---|
87 | // !intag is checked for explicitly, as it may have been
|
---|
88 | // switched off by a non-partner opening tag above: eg:
|
---|
89 | // <H2>A Heading<H3>A Smaller Heading</H3>
|
---|
90 | // ^^^^
|
---|
91 | // here
|
---|
92 |
|
---|
93 | // start fragment if the tag is a starter
|
---|
94 | if (!intag)
|
---|
95 | { if (starttags.matches(tag, true))
|
---|
96 | { block = new HTMLBlock(tag.startPos(), tag.endPos());
|
---|
97 | this.blocks.addElement((Object) block);
|
---|
98 | block.addTag(tag);
|
---|
99 | starttag = tag;
|
---|
100 | intag = true;
|
---|
101 | incontent = false;
|
---|
102 | }
|
---|
103 | }
|
---|
104 | }
|
---|
105 | // if the parse returned text
|
---|
106 | else
|
---|
107 | { text = (HTMLCText) element;
|
---|
108 |
|
---|
109 | if (intag)
|
---|
110 | { block.addText(text);
|
---|
111 | if (!text.nullString())
|
---|
112 | { incontent = true;
|
---|
113 | }
|
---|
114 | }
|
---|
115 | }
|
---|
116 | }
|
---|
117 |
|
---|
118 | // if a tag wasn't "finished");
|
---|
119 | if (intag)
|
---|
120 | { block.addTag(starttag.endTag());
|
---|
121 | }
|
---|
122 |
|
---|
123 | this.title = doc.urlString();
|
---|
124 | }
|
---|
125 |
|
---|
126 | protected HTMLBlockList()
|
---|
127 | {
|
---|
128 | }
|
---|
129 |
|
---|
130 | //
|
---|
131 | // constructor for given vectors of tags: startlist elements always start a new
|
---|
132 | // block [ending an open one if necessary]; endlist elements always end an open
|
---|
133 | // block; optendlist elements close a block iff a block has been started, and
|
---|
134 | // some non-blank content (ie. some "real" text, not just returns/spaces) has
|
---|
135 | // been read since the beginning of the block
|
---|
136 | //
|
---|
137 | public HTMLBlockList(HTMLDoc doc, Vector starttags, Vector endtags, Vector optendtags)
|
---|
138 | { HTMLTagVector _starttags;
|
---|
139 | HTMLTagVector _endtags;
|
---|
140 | HTMLTagVector _optendtags;
|
---|
141 |
|
---|
142 | _starttags = new HTMLTagVector(starttags);
|
---|
143 | _endtags = new HTMLTagVector(starttags);
|
---|
144 | _optendtags = new HTMLTagVector(starttags);
|
---|
145 | this.initialise(doc, _starttags, _endtags, _optendtags);
|
---|
146 | }
|
---|
147 |
|
---|
148 | //
|
---|
149 | // constructor for explicit lists; a tagVector is created for each list;
|
---|
150 | // see above for list roles
|
---|
151 | //
|
---|
152 | public HTMLBlockList(HTMLDoc doc, String startlist, String endlist, String optendlist)
|
---|
153 | { HTMLTagVector starttags;
|
---|
154 | HTMLTagVector endtags;
|
---|
155 | HTMLTagVector optendtags;
|
---|
156 |
|
---|
157 | starttags = new HTMLTagVector(startlist);
|
---|
158 | endtags = new HTMLTagVector(endlist);
|
---|
159 | optendtags = new HTMLTagVector(optendlist);
|
---|
160 | this.initialise(doc, starttags, endtags, optendtags);
|
---|
161 | }
|
---|
162 |
|
---|
163 | //
|
---|
164 | // constructor; get all HTMLBlocks for h{level} through to h1;
|
---|
165 | // eg hlevel==3 test h3,h2,h1
|
---|
166 | //
|
---|
167 | public HTMLBlockList(HTMLDoc doc, int hlevel)
|
---|
168 | { int loop;
|
---|
169 | StringBuffer levelstring;
|
---|
170 | HTMLTagVector starttags;
|
---|
171 | HTMLTagVector endtags;
|
---|
172 | HTMLTagVector optendtags;
|
---|
173 |
|
---|
174 | levelstring = new StringBuffer("h1");
|
---|
175 | for (loop = 2; loop <= hlevel; loop ++)
|
---|
176 | { levelstring.append(",h");
|
---|
177 | levelstring.append(Integer.toString(loop));
|
---|
178 | }
|
---|
179 |
|
---|
180 | starttags = new HTMLTagVector(levelstring.toString());
|
---|
181 | levelstring.append(",table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
|
---|
182 | endtags = new HTMLTagVector(levelstring.toString());
|
---|
183 | optendtags = new HTMLTagVector("div,center,form");
|
---|
184 |
|
---|
185 | this.initialise(doc, starttags, endtags, optendtags);
|
---|
186 | }
|
---|
187 |
|
---|
188 | /**
|
---|
189 | * Static method returning list of anchors of a document
|
---|
190 | */
|
---|
191 | public static synchronized HTMLBlockList anchorlist(HTMLDoc doc)
|
---|
192 | { HTMLBlockList anchors;
|
---|
193 | HTMLTagVector starttags;
|
---|
194 | HTMLTagVector endtags;
|
---|
195 | HTMLTagVector optendtags;
|
---|
196 |
|
---|
197 | anchors = new HTMLBlockList();
|
---|
198 |
|
---|
199 | anchors.initialise(doc, anchorStartTags, anchorEndTags, anchorOptEndTags);
|
---|
200 |
|
---|
201 | return anchors;
|
---|
202 | }
|
---|
203 |
|
---|
204 | //
|
---|
205 | // return how many tags are in the block list
|
---|
206 | //
|
---|
207 | public int size()
|
---|
208 | { if (this.blocks == null)
|
---|
209 | { return 0;
|
---|
210 | }
|
---|
211 |
|
---|
212 | return this.blocks.size();
|
---|
213 | }
|
---|
214 |
|
---|
215 | /**
|
---|
216 | * @return number of blocks in the block list
|
---|
217 | * @deprecated
|
---|
218 | */
|
---|
219 | public int tagCount()
|
---|
220 | { return this.size();
|
---|
221 | }
|
---|
222 |
|
---|
223 | //
|
---|
224 | // return the position of the tag in the document
|
---|
225 | //
|
---|
226 | public int tagPos(int tag)
|
---|
227 | { HTMLBlock block;
|
---|
228 |
|
---|
229 | if (this.blocks == null)
|
---|
230 | { return 0;
|
---|
231 | }
|
---|
232 | block = (HTMLBlock) this.blocks.elementAt(tag);
|
---|
233 | return block.startPos();
|
---|
234 | }
|
---|
235 |
|
---|
236 | //
|
---|
237 | // return the nth block in the list
|
---|
238 | //
|
---|
239 | public HTMLBlock tagBlock(int tag)
|
---|
240 | { HTMLBlock block;
|
---|
241 |
|
---|
242 | block = (HTMLBlock) this.blocks.elementAt(tag);
|
---|
243 | return block;
|
---|
244 | }
|
---|
245 |
|
---|
246 | //
|
---|
247 | // return the index of the next blocklist of the same level
|
---|
248 | //
|
---|
249 | public int tagNext(int attag)
|
---|
250 | { HTMLBlock block;
|
---|
251 | String headTag;
|
---|
252 | int nextTag;
|
---|
253 |
|
---|
254 | block = (HTMLBlock) this.blocks.elementAt(attag);
|
---|
255 | headTag = block.headTagName();
|
---|
256 | nextTag = attag+1;
|
---|
257 |
|
---|
258 | while (nextTag < this.blocks.size())
|
---|
259 | { block = (HTMLBlock) this.blocks.elementAt(nextTag);
|
---|
260 | if (block.headTagName().equals(headTag))
|
---|
261 | { break;
|
---|
262 | }
|
---|
263 | nextTag ++;
|
---|
264 | }
|
---|
265 | return nextTag;
|
---|
266 | }
|
---|
267 |
|
---|
268 | //
|
---|
269 | // return the number of children at the next heading level
|
---|
270 | //
|
---|
271 | public int tagChildren(int fromtag, int level)
|
---|
272 | { int sibling;
|
---|
273 | int descendent;
|
---|
274 | int children;
|
---|
275 | int atlevel;
|
---|
276 | HTMLTag tag;
|
---|
277 |
|
---|
278 | // get current tag level; level 6 headings cannot have
|
---|
279 | // children
|
---|
280 | if (level == 6)
|
---|
281 | { return 0;
|
---|
282 | }
|
---|
283 |
|
---|
284 | // now, if the next tag at this level is the next tag in
|
---|
285 | // the document, then we can't have any children
|
---|
286 | if (fromtag > 0)
|
---|
287 | { sibling = this.tagNext(fromtag-1);
|
---|
288 | }
|
---|
289 | else
|
---|
290 | { sibling = this.size();
|
---|
291 | }
|
---|
292 | if (sibling == fromtag)
|
---|
293 | { children = 0;
|
---|
294 | }
|
---|
295 | // if the next tag at this level is *not* next, then we have
|
---|
296 | // at least one child (ie the next tag in sequence), even if
|
---|
297 | // it is not at the next heading level down.
|
---|
298 | //
|
---|
299 | // if no further children occur at the next level down, we
|
---|
300 | // have only one child. If any further children occur at the
|
---|
301 | // next level down, then they are counted, except where it is
|
---|
302 | // the next tag in sequence, which is automatically accounted
|
---|
303 | // for
|
---|
304 | else
|
---|
305 | { children = 1;
|
---|
306 | for (descendent = fromtag+1; descendent < sibling; descendent ++)
|
---|
307 | { tag = this.tagBlock(descendent).headTag();
|
---|
308 |
|
---|
309 | if (tag.tagLevel() == level + 1)
|
---|
310 | { children ++;
|
---|
311 | }
|
---|
312 | }
|
---|
313 | }
|
---|
314 | return children;
|
---|
315 | }
|
---|
316 |
|
---|
317 |
|
---|
318 | public int tagChildren(int attag)
|
---|
319 | { int sibling;
|
---|
320 | int descendent;
|
---|
321 | int children;
|
---|
322 | int atlevel;
|
---|
323 | HTMLTag tag;
|
---|
324 |
|
---|
325 | // get current tag level; level 6 headings cannot have
|
---|
326 | // children
|
---|
327 | atlevel = this.tagBlock(attag).headTag().tagLevel();
|
---|
328 | if (atlevel == 6)
|
---|
329 | { return 0;
|
---|
330 | }
|
---|
331 |
|
---|
332 | // now, if the next tag at this level is the next tag in
|
---|
333 | // the document, then we can't have any children
|
---|
334 | sibling = this.tagNext(attag);
|
---|
335 | if (sibling == attag + 1)
|
---|
336 | { children = 0;
|
---|
337 | }
|
---|
338 | // if the next tag at this level is *not* next, then we have
|
---|
339 | // at least one child (ie the next tag in sequence), even if
|
---|
340 | // it is not at the next heading level down.
|
---|
341 | //
|
---|
342 | // if no further children occur at the next level down, we
|
---|
343 | // have only one child. If any further children occur at the
|
---|
344 | // next level down, then they are counted, except where it is
|
---|
345 | // the next tag in sequence, which is automatically accounted
|
---|
346 | // for
|
---|
347 | else
|
---|
348 | { children = 1;
|
---|
349 | for (descendent = attag + 2; descendent < sibling; descendent ++)
|
---|
350 | { tag = this.tagBlock(descendent).headTag();
|
---|
351 |
|
---|
352 | if (tag.tagLevel() == atlevel + 1)
|
---|
353 | { children ++;
|
---|
354 | }
|
---|
355 | }
|
---|
356 | }
|
---|
357 | return children;
|
---|
358 | }
|
---|
359 |
|
---|
360 | //
|
---|
361 | // write out a structured output onto System.out
|
---|
362 | //
|
---|
363 | public void sysoutList()
|
---|
364 | { int item;
|
---|
365 | HTMLBlock block;
|
---|
366 |
|
---|
367 | if (this.blocks == null)
|
---|
368 | { return;
|
---|
369 | }
|
---|
370 |
|
---|
371 | for (item = 0; item < this.blocks.size(); item ++)
|
---|
372 | { block = (HTMLBlock) this.blocks.elementAt(item);
|
---|
373 | System.out.println(" @"+ block.startPos()+":"+block.headTagName() +":"+ block);
|
---|
374 | }
|
---|
375 | }
|
---|
376 |
|
---|
377 | //
|
---|
378 | // write out HTMLBlocks as an Enumeration
|
---|
379 | //
|
---|
380 | public Enumeration elements()
|
---|
381 | { return new HTMLBlockListEnumerator(this);
|
---|
382 | }
|
---|
383 | }
|
---|
384 |
|
---|
385 | final class HTMLBlockListEnumerator implements Enumeration
|
---|
386 | { private HTMLBlockList blocklist;
|
---|
387 | private int member;
|
---|
388 |
|
---|
389 | public HTMLBlockListEnumerator(HTMLBlockList blocklist)
|
---|
390 | { this.blocklist = blocklist;
|
---|
391 | this.member = 0;
|
---|
392 | }
|
---|
393 |
|
---|
394 | public Object nextElement()
|
---|
395 | { Object element;
|
---|
396 |
|
---|
397 | element = this.blocklist.blocks.elementAt(member);
|
---|
398 | member ++;
|
---|
399 | return element;
|
---|
400 | }
|
---|
401 |
|
---|
402 | public boolean hasMoreElements()
|
---|
403 | { if (this.blocklist == null || this.member == this.blocklist.blocks.size())
|
---|
404 | { return false;
|
---|
405 | }
|
---|
406 | return true;
|
---|
407 | }
|
---|
408 | }
|
---|
409 |
|
---|