source: trunk/greenstone3-extensions/gs3build/src/org/greenstone/gsdl3/gs3build/util/HTMLBlockList.java@ 12188

Last change on this file since 12188 was 12188, checked in by kjdon, 18 years ago

Initial revision

  • Property svn:keywords set to Author Date Id Revision
File size: 10.5 KB
Line 
1package org.greenstone.gsdl3.gs3build.util;
2
3import java.util.*;
4
5public class HTMLBlockList
6{ Vector blocks;
7 String title;
8 protected static HTMLTagVector anchorStartTags;
9 protected static HTMLTagVector anchorEndTags;
10 protected static HTMLTagVector anchorOptEndTags;
11 protected static HTMLTagVector headingStartTags;
12 protected static HTMLTagVector headingEndTags;
13
14 static
15 { anchorStartTags = new HTMLTagVector("a");
16 anchorEndTags = new HTMLTagVector("a,h1,h2,h3,h4,h5,h6,table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
17 anchorOptEndTags = new HTMLTagVector("div,center,form");
18 headingStartTags = new HTMLTagVector("h1,h2,h3,h4,h5,h6");
19 headingEndTags = new HTMLTagVector("h1,h2,h3,h4,h5,h6,table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
20 }
21
22 // opening opentag incontent result
23 // T T T * {if opening, incontent must be false}
24 // T T F T
25 // T F T * {if opening, incontent must be false}
26 // T F F F
27 // F T T T
28 // F T F F {if getting an opening tag
29 // F F T T
30 // F F F T
31 private boolean inTagList(HTMLTag tag, Vector tags, boolean opening)
32 { int tagref;
33
34 for (tagref = 0; tagref < tags.size(); tagref ++)
35 { if ((tag.xtagName().equals((String) tags.elementAt(tagref))) &&
36 (tag.tagIsOpening() || !opening))
37 { return true;
38 }
39 }
40 return false;
41 }
42
43 //
44 // Common initialisation; parse the document for text surrounded by
45 // pairs of <starttag>....<endtag/starttag> combos
46 //
47 protected void initialise(HTMLDoc doc, HTMLTagVector starttags, HTMLTagVector endtags, HTMLTagVector optendtags)
48 { boolean intag;
49 boolean incontent;
50 HTMLCText text;
51 HTMLTag tag;
52 HTMLTag starttag;
53 HTMLBlock block;
54 HTMLBlock docBlock;
55 Enumeration elements;
56 Object element;
57
58 this.blocks = new Vector(1);
59 block = null;
60 intag = false;
61 incontent = false;
62 starttag = null;
63
64 docBlock = doc.getCodedContent();
65 elements = docBlock.elements();
66
67 while (elements.hasMoreElements())
68 { element = elements.nextElement();
69
70 // if it's a tag
71 if (element instanceof HTMLTag)
72 { tag = (HTMLTag) element;
73
74 // end the fragment if getting a closing tag
75 if (intag)
76 { if (endtags.matches(tag, false) ||
77 (incontent && optendtags.matches(tag, false)))
78 { block.addTag(starttag.endTag());
79 intag = false;
80 incontent = false;
81 }
82 else
83 { block.addTag(tag);
84 }
85 }
86
87 // !intag is checked for explicitly, as it may have been
88 // switched off by a non-partner opening tag above: eg:
89 // <H2>A Heading<H3>A Smaller Heading</H3>
90 // ^^^^
91 // here
92
93 // start fragment if the tag is a starter
94 if (!intag)
95 { if (starttags.matches(tag, true))
96 { block = new HTMLBlock(tag.startPos(), tag.endPos());
97 this.blocks.addElement((Object) block);
98 block.addTag(tag);
99 starttag = tag;
100 intag = true;
101 incontent = false;
102 }
103 }
104 }
105 // if the parse returned text
106 else
107 { text = (HTMLCText) element;
108
109 if (intag)
110 { block.addText(text);
111 if (!text.nullString())
112 { incontent = true;
113 }
114 }
115 }
116 }
117
118 // if a tag wasn't "finished");
119 if (intag)
120 { block.addTag(starttag.endTag());
121 }
122
123 this.title = doc.urlString();
124 }
125
126 protected HTMLBlockList()
127 {
128 }
129
130 //
131 // constructor for given vectors of tags: startlist elements always start a new
132 // block [ending an open one if necessary]; endlist elements always end an open
133 // block; optendlist elements close a block iff a block has been started, and
134 // some non-blank content (ie. some "real" text, not just returns/spaces) has
135 // been read since the beginning of the block
136 //
137 public HTMLBlockList(HTMLDoc doc, Vector starttags, Vector endtags, Vector optendtags)
138 { HTMLTagVector _starttags;
139 HTMLTagVector _endtags;
140 HTMLTagVector _optendtags;
141
142 _starttags = new HTMLTagVector(starttags);
143 _endtags = new HTMLTagVector(starttags);
144 _optendtags = new HTMLTagVector(starttags);
145 this.initialise(doc, _starttags, _endtags, _optendtags);
146 }
147
148 //
149 // constructor for explicit lists; a tagVector is created for each list;
150 // see above for list roles
151 //
152 public HTMLBlockList(HTMLDoc doc, String startlist, String endlist, String optendlist)
153 { HTMLTagVector starttags;
154 HTMLTagVector endtags;
155 HTMLTagVector optendtags;
156
157 starttags = new HTMLTagVector(startlist);
158 endtags = new HTMLTagVector(endlist);
159 optendtags = new HTMLTagVector(optendlist);
160 this.initialise(doc, starttags, endtags, optendtags);
161 }
162
163 //
164 // constructor; get all HTMLBlocks for h{level} through to h1;
165 // eg hlevel==3 test h3,h2,h1
166 //
167 public HTMLBlockList(HTMLDoc doc, int hlevel)
168 { int loop;
169 StringBuffer levelstring;
170 HTMLTagVector starttags;
171 HTMLTagVector endtags;
172 HTMLTagVector optendtags;
173
174 levelstring = new StringBuffer("h1");
175 for (loop = 2; loop <= hlevel; loop ++)
176 { levelstring.append(",h");
177 levelstring.append(Integer.toString(loop));
178 }
179
180 starttags = new HTMLTagVector(levelstring.toString());
181 levelstring.append(",table,hr,p,ul,ol,dl,dt,dd,li,tr,td,th,map");
182 endtags = new HTMLTagVector(levelstring.toString());
183 optendtags = new HTMLTagVector("div,center,form");
184
185 this.initialise(doc, starttags, endtags, optendtags);
186 }
187
188 /**
189 * Static method returning list of anchors of a document
190 */
191 public static synchronized HTMLBlockList anchorlist(HTMLDoc doc)
192 { HTMLBlockList anchors;
193 HTMLTagVector starttags;
194 HTMLTagVector endtags;
195 HTMLTagVector optendtags;
196
197 anchors = new HTMLBlockList();
198
199 anchors.initialise(doc, anchorStartTags, anchorEndTags, anchorOptEndTags);
200
201 return anchors;
202 }
203
204 //
205 // return how many tags are in the block list
206 //
207 public int size()
208 { if (this.blocks == null)
209 { return 0;
210 }
211
212 return this.blocks.size();
213 }
214
215 /**
216 * @return number of blocks in the block list
217 * @deprecated
218 */
219 public int tagCount()
220 { return this.size();
221 }
222
223 //
224 // return the position of the tag in the document
225 //
226 public int tagPos(int tag)
227 { HTMLBlock block;
228
229 if (this.blocks == null)
230 { return 0;
231 }
232 block = (HTMLBlock) this.blocks.elementAt(tag);
233 return block.startPos();
234 }
235
236 //
237 // return the nth block in the list
238 //
239 public HTMLBlock tagBlock(int tag)
240 { HTMLBlock block;
241
242 block = (HTMLBlock) this.blocks.elementAt(tag);
243 return block;
244 }
245
246 //
247 // return the index of the next blocklist of the same level
248 //
249 public int tagNext(int attag)
250 { HTMLBlock block;
251 String headTag;
252 int nextTag;
253
254 block = (HTMLBlock) this.blocks.elementAt(attag);
255 headTag = block.headTagName();
256 nextTag = attag+1;
257
258 while (nextTag < this.blocks.size())
259 { block = (HTMLBlock) this.blocks.elementAt(nextTag);
260 if (block.headTagName().equals(headTag))
261 { break;
262 }
263 nextTag ++;
264 }
265 return nextTag;
266 }
267
268 //
269 // return the number of children at the next heading level
270 //
271 public int tagChildren(int fromtag, int level)
272 { int sibling;
273 int descendent;
274 int children;
275 int atlevel;
276 HTMLTag tag;
277
278 // get current tag level; level 6 headings cannot have
279 // children
280 if (level == 6)
281 { return 0;
282 }
283
284 // now, if the next tag at this level is the next tag in
285 // the document, then we can't have any children
286 if (fromtag > 0)
287 { sibling = this.tagNext(fromtag-1);
288 }
289 else
290 { sibling = this.size();
291 }
292 if (sibling == fromtag)
293 { children = 0;
294 }
295 // if the next tag at this level is *not* next, then we have
296 // at least one child (ie the next tag in sequence), even if
297 // it is not at the next heading level down.
298 //
299 // if no further children occur at the next level down, we
300 // have only one child. If any further children occur at the
301 // next level down, then they are counted, except where it is
302 // the next tag in sequence, which is automatically accounted
303 // for
304 else
305 { children = 1;
306 for (descendent = fromtag+1; descendent < sibling; descendent ++)
307 { tag = this.tagBlock(descendent).headTag();
308
309 if (tag.tagLevel() == level + 1)
310 { children ++;
311 }
312 }
313 }
314 return children;
315 }
316
317
318 public int tagChildren(int attag)
319 { int sibling;
320 int descendent;
321 int children;
322 int atlevel;
323 HTMLTag tag;
324
325 // get current tag level; level 6 headings cannot have
326 // children
327 atlevel = this.tagBlock(attag).headTag().tagLevel();
328 if (atlevel == 6)
329 { return 0;
330 }
331
332 // now, if the next tag at this level is the next tag in
333 // the document, then we can't have any children
334 sibling = this.tagNext(attag);
335 if (sibling == attag + 1)
336 { children = 0;
337 }
338 // if the next tag at this level is *not* next, then we have
339 // at least one child (ie the next tag in sequence), even if
340 // it is not at the next heading level down.
341 //
342 // if no further children occur at the next level down, we
343 // have only one child. If any further children occur at the
344 // next level down, then they are counted, except where it is
345 // the next tag in sequence, which is automatically accounted
346 // for
347 else
348 { children = 1;
349 for (descendent = attag + 2; descendent < sibling; descendent ++)
350 { tag = this.tagBlock(descendent).headTag();
351
352 if (tag.tagLevel() == atlevel + 1)
353 { children ++;
354 }
355 }
356 }
357 return children;
358 }
359
360 //
361 // write out a structured output onto System.out
362 //
363 public void sysoutList()
364 { int item;
365 HTMLBlock block;
366
367 if (this.blocks == null)
368 { return;
369 }
370
371 for (item = 0; item < this.blocks.size(); item ++)
372 { block = (HTMLBlock) this.blocks.elementAt(item);
373 System.out.println(" @"+ block.startPos()+":"+block.headTagName() +":"+ block);
374 }
375 }
376
377 //
378 // write out HTMLBlocks as an Enumeration
379 //
380 public Enumeration elements()
381 { return new HTMLBlockListEnumerator(this);
382 }
383}
384
385final class HTMLBlockListEnumerator implements Enumeration
386{ private HTMLBlockList blocklist;
387 private int member;
388
389 public HTMLBlockListEnumerator(HTMLBlockList blocklist)
390 { this.blocklist = blocklist;
391 this.member = 0;
392 }
393
394 public Object nextElement()
395 { Object element;
396
397 element = this.blocklist.blocks.elementAt(member);
398 member ++;
399 return element;
400 }
401
402 public boolean hasMoreElements()
403 { if (this.blocklist == null || this.member == this.blocklist.blocks.size())
404 { return false;
405 }
406 return true;
407 }
408}
409
Note: See TracBrowser for help on using the repository browser.