source: other-projects/rsyntax-textarea/src/java/org/fife/io/UnicodeReader.java@ 25584

Last change on this file since 25584 was 25584, checked in by davidb, 12 years ago

Initial cut of a text edit area for GLI that supports color syntax highlighting

File size: 8.3 KB
Line 
1/*
2 * 09/23/2004
3 *
4 * UnicodeReader.java - A reader for Unicode input streams that is capable of
5 * discerning which particular encoding is being used via the BOM.
6 *
7 * This library is distributed under a modified BSD license. See the included
8 * RSyntaxTextArea.License.txt file for details.
9 */
10package org.fife.io;
11
12import java.io.File;
13import java.io.FileInputStream;
14import java.io.FileNotFoundException;
15import java.io.InputStream;
16import java.io.InputStreamReader;
17import java.io.IOException;
18import java.io.PushbackInputStream;
19import java.io.Reader;
20
21
/**
 * A reader capable of identifying Unicode streams by their byte order marks
 * (BOMs). This class recognizes the BOMs of the following encodings:
 * <ul>
 *   <li>UTF-8
 *   <li>UTF-16LE
 *   <li>UTF-16BE
 *   <li>UTF-32LE
 *   <li>UTF-32BE
 * </ul>
 * If the stream does not start with any of these BOMs, a default encoding is
 * used for reading. The caller can specify this default encoding; otherwise
 * the platform default is used. A recognized BOM is consumed and is never
 * returned to the caller as character data.<p>
 *
 * For optimum performance, it is recommended that you wrap all instances of
 * <code>UnicodeReader</code> with a <code>java.io.BufferedReader</code>.<p>
 *
 * This class is based on the workaround given in the description of
 * Java Bug 4508058.
 *
 * @author Robert Futrell
 * @version 0.9
 */
public class UnicodeReader extends Reader {

    /**
     * The reader that performs the actual decoding; created by
     * {@link #init(InputStream, String)}.
     */
    private InputStreamReader internalIn = null;

    /**
     * The encoding being used. We keep our own instead of using the string
     * returned by <code>java.io.InputStreamReader</code> since that class
     * does not return user-friendly names.
     */
    private String encoding;

    /**
     * The maximum size of a BOM, in bytes (the UTF-32 BOMs are the longest).
     */
    private static final int BOM_SIZE = 4;


    /**
     * Convenience constructor for reading a file identified by its path.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param file The path of the file from which you want to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(String file) throws IOException,
            FileNotFoundException, SecurityException {
        this(new File(file));
    }


    /**
     * Convenience constructor for reading a file.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param file The file from which you want to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(File file) throws IOException, FileNotFoundException,
            SecurityException {
        this(file, null);
    }


    /**
     * Convenience constructor for reading a file with a fallback encoding.<p>
     * Creates a reader using the encoding specified by the BOM in the file;
     * if there is no recognized BOM, then the specified default encoding is
     * used. If initialization fails after the file has been opened, the file
     * is closed again before the exception propagates.
     *
     * @param file The file from which you want to read.
     * @param defaultEncoding The encoding to use if no BOM is found. If
     *        this value is <code>null</code>, a system default is used.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM, or if <code>defaultEncoding</code> is needed but is not
     *         supported.
     * @throws FileNotFoundException If the file does not exist, is a
     *         directory, or cannot be opened for reading.
     * @throws SecurityException If a security manager exists and its
     *         checkRead method denies read access to the file.
     */
    public UnicodeReader(File file, String defaultEncoding)
            throws IOException, FileNotFoundException,
            SecurityException {
        FileInputStream fileIn = new FileInputStream(file);
        boolean initialized = false;
        try {
            init(fileIn, defaultEncoding);
            initialized = true;
        } finally {
            // We opened this stream ourselves, so nobody else can close it
            // if construction fails part-way.
            if (!initialized) {
                try {
                    fileIn.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
            }
        }
    }


    /**
     * Creates a reader using the encoding specified by the BOM in the stream;
     * if there is no recognized BOM, then a system default encoding is used.
     *
     * @param in The input stream from which to read.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM.
     */
    public UnicodeReader(InputStream in) throws IOException {
        this(in, null);
    }


    /**
     * Creates a reader using the encoding specified by the BOM in the stream;
     * if there is no recognized BOM, then <code>defaultEncoding</code> is
     * used.
     *
     * @param in The input stream from which to read.
     * @param defaultEncoding The encoding to use if no recognized BOM is
     *        found. If this value is <code>null</code>, a system default
     *        is used.
     * @throws IOException If an error occurs when checking for/reading the
     *         BOM, or if <code>defaultEncoding</code> is needed but is not
     *         supported.
     */
    public UnicodeReader(InputStream in, String defaultEncoding)
            throws IOException {
        init(in, defaultEncoding);
    }


    /**
     * Closes this reader and, through it, the underlying input stream.
     *
     * @throws IOException If an I/O error occurs while closing.
     */
    public void close() throws IOException {
        internalIn.close();
    }


    /**
     * Returns the encoding being used to read this input stream (i.e., the
     * encoding of the file). If a BOM was recognized, then the specific
     * Unicode type is returned; otherwise, either the default encoding passed
     * into the constructor or the system default is returned.
     *
     * @return The encoding of the stream.
     */
    public String getEncoding() {
        return encoding;
    }


    /**
     * Reads ahead up to four bytes and checks them for a BOM. Bytes that are
     * not part of a recognized BOM are pushed back onto the stream, so only
     * the BOM itself is skipped.
     *
     * @param in The input stream from which to read.
     * @param defaultEncoding The encoding to use if no BOM was recognized. If
     *        this value is <code>null</code>, then a system default is used.
     * @throws IOException If an error occurs when trying to read a BOM, or
     *         if the selected encoding is not supported.
     */
    protected void init(InputStream in, String defaultEncoding)
            throws IOException {

        PushbackInputStream tempIn = new PushbackInputStream(in, BOM_SIZE);
        byte[] bom = new byte[BOM_SIZE];

        // A single read() may legally return fewer bytes than requested even
        // though more are on the way (sockets, pipes, decompressing streams),
        // so keep reading until the buffer is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = tempIn.read(bom, n, bom.length - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // The 4-byte UTF-32 BOMs must be tested before the 2-byte UTF-16
        // ones, because the UTF-32LE BOM begins with the UTF-16LE BOM.
        int bomLength;
        if (n >= 4 &&
                bom[0] == (byte) 0x00 && bom[1] == (byte) 0x00 &&
                bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF) {
            encoding = "UTF-32BE";
            bomLength = 4;
        }
        else if (n >= 4 &&
                bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE &&
                bom[2] == (byte) 0x00 && bom[3] == (byte) 0x00) {
            encoding = "UTF-32LE";
            bomLength = 4;
        }
        else if (n >= 3 &&
                bom[0] == (byte) 0xEF &&
                bom[1] == (byte) 0xBB &&
                bom[2] == (byte) 0xBF) {
            encoding = "UTF-8";
            bomLength = 3;
        }
        else if (n >= 2 &&
                bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF) {
            encoding = "UTF-16BE";
            bomLength = 2;
        }
        else if (n >= 2 &&
                bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
            encoding = "UTF-16LE";
            bomLength = 2;
        }
        else {
            // No Unicode BOM found; every byte read so far is content.
            encoding = defaultEncoding;
            bomLength = 0;
        }

        // Return everything after the BOM to the stream.
        int unread = n - bomLength;
        if (unread > 0) {
            tempIn.unread(bom, bomLength, unread);
        }

        if (encoding == null) {
            internalIn = new InputStreamReader(tempIn);
            encoding = internalIn.getEncoding(); // Get the default.
        }
        else {
            internalIn = new InputStreamReader(tempIn, encoding);
        }

    }


    /**
     * Reads characters into a portion of an array. This method will block
     * until some input is available, an I/O error occurs, or the end of the
     * stream is reached.
     *
     * @param cbuf The buffer into which to read.
     * @param off The offset at which to start storing characters.
     * @param len The maximum number of characters to read.
     * @return The number of characters read, or <code>-1</code> if the end
     *         of the stream has been reached.
     * @throws IOException If an I/O error occurs.
     */
    public int read(char[] cbuf, int off, int len) throws IOException {
        return internalIn.read(cbuf, off, len);
    }


}
Note: See TracBrowser for help on using the repository browser.