source: trunk/gli/src/org/greenstone/gatherer/util/DecodeHTMLReader.java@ 4364

Last change on this file since 4364 was 4364, checked in by mdewsnip, 21 years ago

Fixed tabbing.

  • Property svn:keywords set to Author Date Id Revision
File size: 3.2 KB
Line 
1package org.greenstone.gatherer.util;
2
3import java.io.*;
4import org.greenstone.gatherer.Gatherer;
5
6public class DecodeHTMLReader
7 extends PushbackReader {
8
9 public DecodeHTMLReader(Reader source) {
10 super(source, 4);
11 }
12
13 /** Read a single character. */
14 public int read() {
15 return decode();
16 }
17
18 /** Read characters into a portion of an array. */
19 public int read(char[] cbuf, int off, int len) {
20 int count = 0;
21 for(int i = off; i < len && ready(); i++) {
22 cbuf[i] = (char)decode();
23 count++;
24 }
25 return count;
26 }
27
28 public boolean ready() {
29 try {
30 return super.ready();
31 }
32 catch (Exception error) {
33 }
34 return false;
35 }
36
37 /** Retrieve the next character off the stream. Unfortunately I have to do this a character at a time (which is slow). I also have to keep in mind that if a suspect encoded character turns out not to be I have to replace the extra characters. */
38 private int decode() {
39 int character;
40 try {
41 character = super.read();
42 }
43 catch (Exception error) {
44 character = ' ';
45 }
46 try {
47 if(character == '&') {
48 int amp = super.read();
49 switch(amp) {
50 case 'a':
51 case 'A':
52 int ampap = super.read();
53 int ampapo = super.read();
54 int ampapos = super.read();
55 int ampapossemi = super.read();
56 if((ampap == 'p' || ampap == 'P') && (ampapo == 'o' || ampapo == 'O') && (ampapos == 's' || ampapos == 'S') && ampapossemi == ';') {
57 // Read an &apos; so return an apostrophy
58 return '\'';
59 }
60 // Not a apos. Return the characters removed in the correct order.
61 super.unread(ampapossemi);
62 super.unread(ampapos);
63 super.unread(ampapo);
64 super.unread(ampap);
65 break;
66 case 'g':
67 case 'G':
68 int ampgt = super.read();
69 int ampgtsemi = super.read();
70 if((ampgt == 't' || ampgt == 'T') && ampgtsemi == ';') {
71 return '>';
72 }
73 super.unread(ampgtsemi);
74 super.unread(ampgt);
75 break;
76 case 'l':
77 case 'L':
78 int amplt = super.read();
79 int ampltsemi = super.read();
80 if((amplt == 't' || amplt == 'T') && ampltsemi == ';') {
81 return '<';
82 }
83 super.unread(ampltsemi);
84 super.unread(amplt);
85 break;
86 case 'q':
87 case 'Q':
88 int ampqu = super.read();
89 int ampquo = super.read();
90 int ampquot = super.read();
91 int ampquotsemi = super.read();
92 if((ampqu == 'u' || ampqu == 'U') && (ampquo == 'o' || ampquo == 'O') && (ampquot == 't' || ampquot == 'T') && ampquotsemi == ';') {
93 return '\"';
94 }
95 super.unread(ampquotsemi);
96 super.unread(ampquot);
97 super.unread(ampquo);
98 super.unread(ampqu);
99 break;
100 case '#':
101 int amphash = super.read();
102 int amphash3 = super.read();
103 int amphash39 = super.read();
104 int amphash39semi = super.read();
105 if(amphash == '#' && amphash3 == '3' && amphash39 == '9' && amphash39semi == ';') {
106 return '\'';
107 }
108 super.unread(amphash39semi);
109 super.unread(amphash39);
110 super.unread(amphash3);
111 super.unread(amphash);
112 break;
113 }
114 // Not a suspect. Return the character removed.
115 super.unread(amp);
116 }
117 }
118 catch (Exception error) {
119 Gatherer.printStackTrace(error);
120 }
121 // Nothing special. Simply return the character extracted.
122 return character;
123 }
124}
Note: See TracBrowser for help on using the repository browser.