/*
 * 09/23/2004
 *
 * UnicodeReader.java - A reader for Unicode input streams that is capable of
 * discerning which particular encoding is being used via the BOM.
 *
 * This library is distributed under a modified BSD license.  See the included
 * RSyntaxTextArea.License.txt file for details.
 */
10 | package org.fife.io;
|
---|
11 |
|
---|
12 | import java.io.File;
|
---|
13 | import java.io.FileInputStream;
|
---|
14 | import java.io.FileNotFoundException;
|
---|
15 | import java.io.InputStream;
|
---|
16 | import java.io.InputStreamReader;
|
---|
17 | import java.io.IOException;
|
---|
18 | import java.io.PushbackInputStream;
|
---|
19 | import java.io.Reader;
|
---|
20 |
|
---|
21 |
|
---|
/**
 * A reader capable of identifying Unicode streams by their BOMs.  This class
 * will recognize the following encodings:
 * <ul>
 *   <li>UTF-8
 *   <li>UTF-16LE
 *   <li>UTF-16BE
 *   <li>UTF-32LE
 *   <li>UTF-32BE
 * </ul>
 * If the stream is not found to be any of the above, then a default encoding
 * is used for reading.  The user can specify this default encoding, or a system
 * default will be used.<p>
 *
 * For optimum performance, it is recommended that you wrap all instances of
 * <code>UnicodeReader</code> with a <code>java.io.BufferedReader</code>.<p>
 *
 * This class is largely based on the workaround in the description of
 * Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
public class UnicodeReader extends Reader {

    /**
     * The reader that decodes the underlying stream.  It is assigned by
     * {@link #init(InputStream, String)}, which every constructor calls.
     */
    private InputStreamReader internalIn = null;

    /**
     * The encoding being used.  We keep our own instead of using the string
     * returned by <code>java.io.InputStreamReader</code> since that class
     * does not return user-friendly names.
     */
    private String encoding;

    /**
     * The maximum number of bytes a recognized BOM can occupy; this many
     * bytes are read ahead when sniffing the stream.
     */
    private static final int BOM_SIZE = 4;


    /**
     * This utility constructor is here because you will usually use a
     * <code>UnicodeReader</code> on files.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param file The name of the file from which you want to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(String file) throws IOException,
            FileNotFoundException, SecurityException {
        this(new File(file));
    }


    /**
     * This utility constructor is here because you will usually use a
     * <code>UnicodeReader</code> on files.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param file The file from which you want to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(File file) throws IOException, FileNotFoundException,
            SecurityException {
        this(file, null);
    }


    /**
     * This utility constructor is here because you will usually use a
     * <code>UnicodeReader</code> on files.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then a specified default encoding is
     * used.<p>
     * The file stream is opened by this constructor, so it is also closed
     * here if initialization fails (for example, when
     * <code>defaultEncoding</code> is not a supported charset name).
     *
     * @param file The file from which you want to read.
     * @param defaultEncoding The encoding to use if no BOM is found.  If
     *        this value is <code>null</code>, a system default is used.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(File file, String defaultEncoding)
            throws IOException, FileNotFoundException,
            SecurityException {
        InputStream in = new FileInputStream(file);
        boolean initialized = false;
        try {
            init(in, defaultEncoding);
            initialized = true;
        }
        finally {
            // Nobody else holds a reference to the stream yet, so release it
            // ourselves when construction does not complete.
            if (!initialized) {
                closeQuietly(in);
            }
        }
    }


    /**
     * Creates a reader using the encoding specified by the BOM in the stream;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param in The input stream from which to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     */
    public UnicodeReader(InputStream in) throws IOException {
        this(in, null);
    }


    /**
     * Creates a reader using the encoding specified by the BOM in the stream;
     * if there is no recognized BOM, then <code>defaultEncoding</code> is
     * used.
     *
     * @param in The input stream from which to read.
     * @param defaultEncoding The encoding to use if no recognized BOM is
     *        found.  If this value is <code>null</code>, a system default
     *        is used.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     */
    public UnicodeReader(InputStream in, String defaultEncoding)
            throws IOException {
        init(in, defaultEncoding);
    }


    /**
     * Closes this reader by closing the internal decoding reader.
     *
     * @throws IOException If an I/O error occurs while closing.
     */
    @Override
    public void close() throws IOException {
        internalIn.close();
    }


    /**
     * Closes a stream, discarding any <code>IOException</code> raised while
     * doing so.  Only used on failure paths, where the original error is the
     * one worth reporting.
     *
     * @param in The stream to close.
     */
    private static void closeQuietly(InputStream in) {
        try {
            in.close();
        } catch (IOException ignored) {
            // Intentionally ignored: the caller is already propagating the
            // exception that caused initialization to fail.
        }
    }


    /**
     * Returns the encoding being used to read this input stream (i.e., the
     * encoding of the file).  If a BOM was recognized, then the specific
     * Unicode type is returned; otherwise, either the default encoding passed
     * into the constructor or the system default is returned.
     *
     * @return The encoding of the stream.
     */
    public String getEncoding() {
        return encoding;
    }


    /**
     * Reads ahead up to four bytes and checks for a BOM.  Bytes that are not
     * part of a recognized BOM are pushed back onto the stream; only BOM
     * bytes are skipped.  On return, the internal reader and the reported
     * encoding are set.
     *
     * @param in The input stream from which to read.
     * @param defaultEncoding The encoding to use if no BOM was recognized.  If
     *        this value is <code>null</code>, then a system default is used.
     * @throws IOException If an error occurs when trying to read a BOM, or
     *         if the selected encoding is not supported.
     */
    protected void init(InputStream in, String defaultEncoding)
            throws IOException {

        PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);

        byte[] bom = new byte[BOM_SIZE];
        int n = readFully(tempIn, bom);

        // The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks
        // because the UTF-32LE mark begins with the UTF-16LE mark (FF FE).
        int bomLength;
        if (n >= 4 &&
                (bom[0] == (byte)0x00) && (bom[1] == (byte)0x00) &&
                (bom[2] == (byte)0xFE) && (bom[3] == (byte)0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        }
        else if (n >= 4 &&
                (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE) &&
                (bom[2] == (byte)0x00) && (bom[3] == (byte)0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        }
        else if (n >= 3 &&
                (bom[0] == (byte)0xEF) &&
                (bom[1] == (byte)0xBB) &&
                (bom[2] == (byte)0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        }
        else if (n >= 2 &&
                (bom[0] == (byte)0xFE) && (bom[1] == (byte)0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        }
        else if (n >= 2 &&
                (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        }
        else {
            // Unicode BOM mark not found; every byte read is content.
            encoding = defaultEncoding;
            bomLength = 0;
        }

        // Push back whatever was read beyond the BOM so it is decoded as
        // regular content.
        int unread = n - bomLength;
        if (unread > 0) {
            tempIn.unread(bom, bomLength, unread);
        }

        if (encoding == null) {
            // No BOM and no caller-supplied default: let InputStreamReader
            // pick its default and report whatever name it chose.
            internalIn = new InputStreamReader(tempIn);
            encoding = internalIn.getEncoding();
        }
        else {
            internalIn = new InputStreamReader(tempIn, encoding);
        }

    }


    /**
     * Read characters into a portion of an array.  This method will block until
     * some input is available, an I/O error occurs, or the end of the stream
     * is reached.
     *
     * @param cbuf The buffer into which to read.
     * @param off The offset at which to start storing characters.
     * @param len The maximum number of characters to read.
     * @return The number of characters read, or <code>-1</code> if the end
     *         of the stream has been reached.
     * @throws IOException If an I/O error occurs.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        return internalIn.read(cbuf, off, len);
    }


    /**
     * Fills <code>buf</code> from <code>in</code>, reading repeatedly until
     * the buffer is full or the stream ends.  A single <code>read</code> call
     * is allowed to return fewer bytes than requested even when more will
     * follow, which would otherwise cause a BOM to be missed.
     *
     * @param in The stream to read from.
     * @param buf The buffer to fill.
     * @return The number of bytes stored in <code>buf</code>; <code>0</code>
     *         if the stream was already at its end.
     * @throws IOException If an I/O error occurs.
     */
    private static int readFully(InputStream in, byte[] buf)
            throws IOException {
        int total = 0;
        while (total < buf.length) {
            int count = in.read(buf, total, buf.length - total);
            // -1 is end of stream.  0 should not occur for a positive length,
            // but stopping on it avoids spinning on a stream that returns it.
            if (count <= 0) {
                break;
            }
            total += count;
        }
        return total;
    }


}