source: other-projects/UTF8-Fix/src/ByteFixer.java@ 32225

Last change on this file since 32225 was 32225, checked in by ak19, 6 years ago

Committing Dr Bainbridge's (Eclipse) project UTF8-Fix which contains Java code that fixes up files that aren't fully UTF-8, but can contain stray non-UTF-8 chars like Latin-1.

File size: 2.8 KB
Line 
1import java.io.*;
2import java.util.ArrayList;
3
4
/**
 * Reads a file one "character" at a time and returns each character as UTF-8 bytes.
 *
 * <p>Bytes that already form an ASCII byte or a complete multi-byte sequence (a lead byte
 * followed by the number of {@code 10xxxxxx} continuation bytes announced by the lead byte)
 * are passed through unchanged. Anything else is treated as ISO-8859-1 and re-encoded as
 * UTF-8, with two single-byte special cases (the "ANSEL quick fix") that are replaced by
 * ASCII punctuation instead.
 *
 * <p>Instances hold mutable, unsynchronized state and are meant to be used from one thread.
 */
public class ByteFixer {
    /** Marker for "no overrun byte is pending". */
    private static final int NO_OVERRUN = -1;

    private final File fin;
    /** Bytes collected for the character currently being assembled. */
    private final ByteArrayOutputStream queue = new ByteArrayOutputStream();
    /**
     * Unsigned value of the byte that terminated a broken sequence and still has to be
     * processed as the start of the next character, or {@link #NO_OVERRUN}.
     */
    private int overrunByte = NO_OVERRUN;
    /** Input stream, or {@code null} when the input file could not be opened. */
    private final DataInputStream reader;

    /**
     * Opens the given file for reading. If the file does not exist a message is printed to
     * standard error and the instance behaves like an already exhausted input.
     *
     * @param FileName path of the file to read
     */
    public ByteFixer(String FileName) {
        fin = new File(FileName);
        DataInputStream in = null;
        try {
            // Buffered so that byte-at-a-time reads do not each hit the file system.
            in = new DataInputStream(new BufferedInputStream(new FileInputStream(fin)));
        } catch (FileNotFoundException e) {
            System.err.println("Input file not found");
        }
        reader = in;
    }

    /** Closes the underlying input stream, if one was opened. */
    public void close() {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Rewrites the bytes currently queued so that they are valid UTF-8.
     *
     * <p>A queue holding exactly the byte {@code 0xB0} becomes a back-tick and exactly
     * {@code 0xFE} becomes an apostrophe (the "ANSEL quick fix"). Every other queue content
     * is decoded as ISO-8859-1 and re-encoded as UTF-8, and the result is logged to
     * standard error.
     *
     * @throws UnsupportedEncodingException if a required charset is unavailable
     */
    public void fixCharacter() throws UnsupportedEncodingException {
        byte[] raw = queue.toByteArray();

        if (raw.length == 1) {
            // Java bytes are signed; mask to compare against the unsigned byte values.
            int only = raw[0] & 0xFF;
            if (only == 0xB0) {
                queue.reset();
                queue.write('`');
                return;
            }
            if (only == 0xFE) {
                queue.reset();
                queue.write('\'');
                return;
            }
        }

        String str = new String(raw, "ISO-8859-1");
        byte[] utf8 = str.getBytes("UTF-8");

        System.err.println("FIXED TO: " + str);

        queue.reset();
        queue.write(utf8, 0, utf8.length);
    }

    /**
     * Returns the UTF-8 bytes of the next character in the input.
     *
     * <p>When a multi-byte sequence is cut short by a byte that is not a continuation byte,
     * the bytes read so far are repaired and returned, and the terminating byte is kept and
     * processed as the start of the character returned by the next call. A sequence cut
     * short by the end of the file is repaired and returned as well.
     *
     * @return the bytes of the next character, or {@code null} once the input is exhausted
     *         (or could not be opened)
     */
    public byte[] getNextCharacter() {
        if (reader == null) {
            return null;
        }

        try {
            int head;
            if (overrunByte != NO_OVERRUN) {
                // Resume with the byte that ended the previous, broken sequence.
                head = overrunByte;
                overrunByte = NO_OVERRUN;
            } else {
                head = reader.read();
            }

            if (head < 0) {
                System.out.println("End of file reached");
                return null;
            }

            queue.write(head);

            if ((head & 0x80) != 0) {
                if ((head & 0xC0) != 0xC0) {
                    // A continuation byte with no lead byte: not valid UTF-8 on its own.
                    fixCharacter();
                } else {
                    readContinuationBytes(head);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        byte[] result = queue.toByteArray();
        queue.reset();
        return result;
    }

    /**
     * Reads the continuation bytes announced by a lead byte into the queue, repairing the
     * queue if the sequence turns out to be broken or truncated.
     */
    private void readContinuationBytes(int head) throws IOException {
        // Each 1 bit after the top bit of the lead byte announces one continuation byte;
        // shifting left brings the next announcing bit into the 0x80 position.
        for (int lead = head << 1; (lead & 0x80) != 0; lead <<= 1) {
            int next = reader.read();
            if (next < 0) {
                // File ended in the middle of a sequence: keep what was read, repaired.
                System.err.println("Encountered truncated UTF8 sequence at end of file");
                fixCharacter();
                return;
            }
            if ((next & 0xC0) != 0x80) {
                System.err.println("Encountered non-UTF8 character " + Integer.toBinaryString(next));
                fixCharacter();
                overrunByte = next;
                return;
            }
            queue.write(next);
        }
    }
}
Note: See TracBrowser for help on using the repository browser.