1 | package org.greenstone.gatherer.util;
|
---|
2 |
|
---|
3 | import java.io.*;
|
---|
4 | import org.greenstone.gatherer.Gatherer;
|
---|
5 |
|
---|
6 | public class DecodeHTMLReader
|
---|
7 | extends PushbackReader {
|
---|
8 |
|
---|
9 | public DecodeHTMLReader(Reader source) {
|
---|
10 | super(source, 4);
|
---|
11 | }
|
---|
12 |
|
---|
13 | /** Read a single character. */
|
---|
14 | public int read() {
|
---|
15 | return decode();
|
---|
16 | }
|
---|
17 |
|
---|
18 | /** Read characters into a portion of an array. */
|
---|
19 | public int read(char[] cbuf, int off, int len) {
|
---|
20 | int count = 0;
|
---|
21 | for(int i = off; i < len && ready(); i++) {
|
---|
22 | cbuf[i] = (char)decode();
|
---|
23 | count++;
|
---|
24 | }
|
---|
25 | return count;
|
---|
26 | }
|
---|
27 |
|
---|
28 | public boolean ready() {
|
---|
29 | try {
|
---|
30 | return super.ready();
|
---|
31 | }
|
---|
32 | catch (Exception error) {
|
---|
33 | }
|
---|
34 | return false;
|
---|
35 | }
|
---|
36 |
|
---|
37 | /** Retrieve the next character off the stream. Unfortunately I have to do this a character at a time (which is slow). I also have to keep in mind that if a suspect encoded character turns out not to be I have to replace the extra characters. */
|
---|
38 | private int decode() {
|
---|
39 | int character;
|
---|
40 | try {
|
---|
41 | character = super.read();
|
---|
42 | }
|
---|
43 | catch (Exception error) {
|
---|
44 | character = ' ';
|
---|
45 | }
|
---|
46 | try {
|
---|
47 | if(character == '&') {
|
---|
48 | int amp = super.read();
|
---|
49 | switch(amp) {
|
---|
50 | case 'a':
|
---|
51 | case 'A':
|
---|
52 | int ampap = super.read();
|
---|
53 | int ampapo = super.read();
|
---|
54 | int ampapos = super.read();
|
---|
55 | int ampapossemi = super.read();
|
---|
56 | if((ampap == 'p' || ampap == 'P') && (ampapo == 'o' || ampapo == 'O') && (ampapos == 's' || ampapos == 'S') && ampapossemi == ';') {
|
---|
57 | // Read an ' so return an apostrophy
|
---|
58 | return '\'';
|
---|
59 | }
|
---|
60 | // Not a apos. Return the characters removed in the correct order.
|
---|
61 | super.unread(ampapossemi);
|
---|
62 | super.unread(ampapos);
|
---|
63 | super.unread(ampapo);
|
---|
64 | super.unread(ampap);
|
---|
65 | break;
|
---|
66 | case 'g':
|
---|
67 | case 'G':
|
---|
68 | int ampgt = super.read();
|
---|
69 | int ampgtsemi = super.read();
|
---|
70 | if((ampgt == 't' || ampgt == 'T') && ampgtsemi == ';') {
|
---|
71 | return '>';
|
---|
72 | }
|
---|
73 | super.unread(ampgtsemi);
|
---|
74 | super.unread(ampgt);
|
---|
75 | break;
|
---|
76 | case 'l':
|
---|
77 | case 'L':
|
---|
78 | int amplt = super.read();
|
---|
79 | int ampltsemi = super.read();
|
---|
80 | if((amplt == 't' || amplt == 'T') && ampltsemi == ';') {
|
---|
81 | return '<';
|
---|
82 | }
|
---|
83 | super.unread(ampltsemi);
|
---|
84 | super.unread(amplt);
|
---|
85 | break;
|
---|
86 | case 'q':
|
---|
87 | case 'Q':
|
---|
88 | int ampqu = super.read();
|
---|
89 | int ampquo = super.read();
|
---|
90 | int ampquot = super.read();
|
---|
91 | int ampquotsemi = super.read();
|
---|
92 | if((ampqu == 'u' || ampqu == 'U') && (ampquo == 'o' || ampquo == 'O') && (ampquot == 't' || ampquot == 'T') && ampquotsemi == ';') {
|
---|
93 | return '\"';
|
---|
94 | }
|
---|
95 | super.unread(ampquotsemi);
|
---|
96 | super.unread(ampquot);
|
---|
97 | super.unread(ampquo);
|
---|
98 | super.unread(ampqu);
|
---|
99 | break;
|
---|
100 | case '#':
|
---|
101 | int amphash = super.read();
|
---|
102 | int amphash3 = super.read();
|
---|
103 | int amphash39 = super.read();
|
---|
104 | int amphash39semi = super.read();
|
---|
105 | if(amphash == '#' && amphash3 == '3' && amphash39 == '9' && amphash39semi == ';') {
|
---|
106 | return '\'';
|
---|
107 | }
|
---|
108 | super.unread(amphash39semi);
|
---|
109 | super.unread(amphash39);
|
---|
110 | super.unread(amphash3);
|
---|
111 | super.unread(amphash);
|
---|
112 | break;
|
---|
113 | }
|
---|
114 | // Not a suspect. Return the character removed.
|
---|
115 | super.unread(amp);
|
---|
116 | }
|
---|
117 | }
|
---|
118 | catch (Exception error) {
|
---|
119 | Gatherer.printStackTrace(error);
|
---|
120 | }
|
---|
121 | // Nothing special. Simply return the character extracted.
|
---|
122 | return character;
|
---|
123 | }
|
---|
124 | }
|
---|