import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

/**
 * Reads a file one byte at a time and returns it one "character" (a short byte array) per call,
 * repairing bytes that do not form a structurally valid UTF-8 sequence.
 *
 * <p>A byte run is considered structurally valid when a lead byte is followed by the number of
 * continuation bytes ({@code 10xxxxxx}) announced by its leading one-bits. Anything else is
 * reinterpreted as ISO-8859-1 and re-encoded as UTF-8 (see {@link #fixCharacter()}).
 *
 * <p>NOTE(review): only the bit structure is checked; overlong encodings and lead bytes
 * {@code 0xF8}-{@code 0xFF} are not rejected up front. Confirm whether stricter validation is
 * wanted.
 *
 * <p>Instances are not designed for concurrent use: all state is plain mutable fields.
 */
public class ByteFixer {

    /** Sentinel for {@link #pendingByte} meaning "nothing carried over". */
    private static final int NO_PENDING_BYTE = -1;

    private final File fin;

    /** Bytes collected for the character currently being assembled. */
    private final List<Byte> queue = new ArrayList<Byte>();

    /** Input stream, or {@code null} when the file could not be opened. */
    private final DataInputStream reader;

    /**
     * Unsigned value of a byte that was read while looking for a continuation byte but turned out
     * to start the next character, or {@link #NO_PENDING_BYTE}.
     */
    private int pendingByte = NO_PENDING_BYTE;

    /**
     * Opens {@code FileName} for reading.
     *
     * <p>If the file cannot be opened, a message is written to {@code System.err} and the
     * instance behaves as an already exhausted stream: {@link #getNextCharacter()} returns
     * {@code null} and {@link #close()} does nothing.
     *
     * @param FileName path of the file to read
     */
    public ByteFixer(String FileName) {
        fin = new File(FileName);
        DataInputStream opened = null;
        try {
            // Buffered so that byte-at-a-time reads do not each hit the underlying file.
            opened = new DataInputStream(new BufferedInputStream(new FileInputStream(fin)));
        } catch (FileNotFoundException e) {
            System.err.println("Input file not found");
        }
        reader = opened;
    }

    /** Closes the underlying stream, if one was opened. I/O errors are printed, not thrown. */
    public void close() {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Repairs the bytes currently queued for the character being assembled.
     *
     * <p>A single queued byte of value {@code 0xB0} is replaced by a backtick and {@code 0xFE} by
     * an apostrophe (presumably ANSEL code points, judging by the original naming; confirm against
     * the data source). Any other content is decoded as ISO-8859-1 and re-encoded as UTF-8, and
     * the result is logged to {@code System.err}.
     *
     * @throws UnsupportedEncodingException never thrown by this implementation; retained in the
     *     signature so existing callers that catch it keep compiling
     */
    public void fixCharacter() throws UnsupportedEncodingException {
        repairQueue();
    }

    /**
     * Returns the bytes of the next character from the input.
     *
     * <p>Valid UTF-8 sequences and ASCII bytes are returned unchanged; invalid bytes are returned
     * in repaired form. A sequence cut short by the end of the file is repaired and returned
     * rather than dropped; the following call then reports end of file.
     *
     * @return the bytes of the next character, or {@code null} once the end of the file has been
     *     reached or if the file could not be opened
     */
    public byte[] getNextCharacter() {
        if (reader == null) {
            return null;
        }
        try {
            int head = nextHeadByte();
            queue.add((byte) head);
            if ((head & 0x80) != 0) {
                if ((head & 0xC0) != 0xC0) {
                    // 10xxxxxx: a continuation byte with no lead byte before it.
                    repairQueue();
                } else {
                    readContinuationBytes(head);
                }
            }
        } catch (EOFException e) {
            if (queue.isEmpty()) {
                System.out.println("End of file reached");
                return null;
            }
            // The file ended in the middle of a multi-byte sequence: salvage what was read.
            repairQueue();
        } catch (IOException e) {
            // Best effort, as before: report the error and hand back whatever was collected.
            e.printStackTrace();
        }
        return drainQueue();
    }

    /**
     * Returns the first byte of the next character as an unsigned value, preferring a byte
     * carried over from the previous call so it gets classified like any other byte.
     */
    private int nextHeadByte() throws IOException {
        if (pendingByte != NO_PENDING_BYTE) {
            int head = pendingByte;
            pendingByte = NO_PENDING_BYTE;
            return head;
        }
        return reader.readUnsignedByte();
    }

    /**
     * Reads the continuation bytes announced by {@code head}; on the first byte that is not a
     * continuation byte, repairs the queue and carries that byte over to the next call.
     */
    private void readContinuationBytes(int head) throws IOException {
        // Each leading one-bit after the first announces one continuation byte.
        int lengthBits = head << 1;
        while ((lengthBits & 0x80) != 0) {
            int next = reader.readUnsignedByte();
            if ((next & 0xC0) != 0x80) {
                System.err.println("Encountered non-UTF8 character " + Integer.toBinaryString(next));
                repairQueue();
                pendingByte = next;
                return;
            }
            queue.add((byte) next);
            lengthBits <<= 1;
        }
    }

    /** Applies the single-byte quick fixes or the ISO-8859-1 to UTF-8 re-encoding to the queue. */
    private void repairQueue() {
        if (queue.size() == 1) {
            // Mask to an unsigned value: a Java byte is signed and can never equal 0xB0 or 0xFE.
            int only = queue.get(0) & 0xFF;
            if (only == 0xB0) {
                queue.set(0, (byte) '`');
                return;
            }
            if (only == 0xFE) {
                queue.set(0, (byte) '\'');
                return;
            }
        }
        String decoded = new String(queueToArray(), StandardCharsets.ISO_8859_1);
        byte[] utf8 = decoded.getBytes(StandardCharsets.UTF_8);
        System.err.println("FIXED TO: " + new String(utf8, StandardCharsets.UTF_8));
        queue.clear();
        for (byte b : utf8) {
            queue.add(b);
        }
    }

    /** Copies the queued bytes into a new array without modifying the queue. */
    private byte[] queueToArray() {
        byte[] bytes = new byte[queue.size()];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = queue.get(i);
        }
        return bytes;
    }

    /** Returns the queued bytes and empties the queue for the next character. */
    private byte[] drainQueue() {
        byte[] bytes = queueToArray();
        queue.clear();
        return bytes;
    }
}