Changeset 18462
- Timestamp: 2009-02-04T16:42:36+13:00 (14 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
gli/trunk/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java
r18195 r18462 4 4 import java.io.IOException; 5 5 6 import java.io.File; 7 import java.io.FileReader; 8 9 import java.util.regex.Pattern; 10 6 11 public class RemoveContentBeforeRootElementXMLReader extends Reader { 7 12 8 static final byte[] xmlIndicator = "<?xml".getBytes(); 13 static final Pattern[] xmlIndicators; 14 static Pattern commentStart; 15 static Pattern commentStop; 9 16 10 Reader ur; 11 int foundBytesReturned = 0; 12 boolean allFoundBytesReturned = false; //redundant, but may help performance 17 static { 18 // generate the xml starting sequences we will try to match against 19 xmlIndicators = new Pattern[3]; 20 try{ 21 xmlIndicators[0] = Pattern.compile("<\\?xml"); //usual xml declaration 22 xmlIndicators[1] = Pattern.compile("<!DOCTYPE"); //doctype declaration 23 xmlIndicators[2] = Pattern.compile("<[a-zA-Z-]+[ >/]"); // the beginning of a root node 24 25 commentStart = Pattern.compile("<!--"); 26 commentStop = Pattern.compile("-->"); 27 28 } catch ( java.util.regex.PatternSyntaxException pse ) { 29 System.err.println( "Pattern no good. 
" + pse ); 30 xmlIndicators[0] = xmlIndicators[1] = xmlIndicators[2] = null; 31 commentStart = commentStop = null; 32 } 33 } 34 35 Reader ur; 36 37 String finalBuffer = null; 38 int finalBufferIndex = 0; 39 40 public RemoveContentBeforeRootElementXMLReader( Reader ur ) { 41 42 this.ur = ur; 43 44 int found = -1; 45 boolean inComment = false; 46 StringBuffer buffer = null; 47 48 for ( int c = 0; c != -1 && found == -1; ) { 49 50 //read a character 51 try { 52 c = ur.read(); 53 } catch( Exception e ) { 54 System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" ); 55 } 56 57 //break out if we have reached the end of the input 58 if ( c == -1 ) { 59 break; 60 } 61 62 //we start buffering when we come across the first < 63 //regardless of whether it turns out to be a relevant < or not 64 if ( buffer == null && (char)c == '<' ) { 65 buffer = new StringBuffer(); 66 } 67 68 //if not buffering, just display the character and move onto next character 69 if ( buffer == null ) { 70 System.err.print( (char)c ); 71 continue; 72 } 73 74 buffer.append( (char)c ); 75 76 //check for comment open or close 77 if ( !inComment ) { 78 if ( commentStart.matcher(buffer.toString()).find() ) { 79 inComment = true; 80 System.err.print( buffer.toString() ); 81 buffer = new StringBuffer(); 82 } 83 } else { 84 if ( commentStop.matcher(buffer.toString()).find() ) { 85 inComment = false; 86 System.err.print( buffer.toString() ); 87 buffer = new StringBuffer(); 88 89 //skip to reading next character 90 continue; 91 } 92 } 93 94 if ( !inComment ) { 95 96 //check each indicator to see if found 97 for ( int i = 0; i < xmlIndicators.length && found == -1; i++ ) { 98 if ( xmlIndicators[i].matcher(buffer.toString()).find() ) { 99 found = i; 100 String line = buffer.toString(); 101 int lastIndex = line.lastIndexOf('<'); 102 //flush the previous characters in the buffer to the console 103 System.err.print(line.substring(0, lastIndex)); 104 buffer.delete(0, 
lastIndex); 105 finalBuffer = buffer.toString(); 106 } 107 } 108 109 } 110 } 111 112 if ( found == -1 ) { 113 System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" + 114 "The XML being loaded was not valid: couldn't find start of XML input" ); 115 } 116 117 } 118 119 public int read( char[] cbuf, int off, int len ) throws IOException { 120 121 for ( int i=off; i<off+len && i<cbuf.length; i++ ) { 122 123 //read from underlying reader 124 int c = read(); 125 126 //catch end of stream 127 if ( c == -1 ) { 128 if ( i == off ) { 129 return -1; 130 } 131 return i - off; 132 } 133 134 //insert character into the array 135 cbuf[i] = (char)c; 136 } 137 return len; 138 139 } 140 141 public int read() throws IOException { 142 143 //flush the buffer containing the opening XML sequence 144 if ( finalBuffer != null && finalBuffer.length() > finalBufferIndex ) { 145 char c = finalBuffer.charAt(finalBufferIndex++); 146 if ( finalBufferIndex == finalBuffer.length() ) { 147 finalBuffer = null; 148 } 149 return c; 150 } 151 152 return ur.read(); 153 } 154 155 public void close() throws IOException { 156 ur.close(); 157 } 158 159 public static void main ( String[] args ) { 160 161 //init 162 System.out.println( "------------\nWill now initialise the test reader\n------------" ); 163 RemoveContentBeforeRootElementXMLReader parser = null; 164 try { 165 parser = new RemoveContentBeforeRootElementXMLReader( 166 new FileReader( new File("text.xml") ) ); 167 } catch ( java.io.FileNotFoundException fnfe ) { 168 System.err.println( "Please create text.xml to test this class" ); 169 System.exit(-1); 170 } 171 172 //read the rest of the input 173 System.out.println( "------------\nWill now read the rest of the input\n------------" ); 174 try { 175 int c = 0; 176 while ( ( c = parser.read() ) != -1 ) { 177 System.out.print( (char)c ); 178 } 179 } catch ( Exception e ) { 180 System.err.println("Exception: " + e); 181 } 182 183 } 13 184 14 185 15 public 
RemoveContentBeforeRootElementXMLReader( Reader ur ) {16 17 this.ur = ur;18 19 //read up to the xml indicator20 int foundBytes = 0;21 int c = 0;22 while ( c != -1 ) {23 try {24 c = ur.read();25 } catch( Exception e ) {26 System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" );27 }28 29 if ( c == (int)xmlIndicator[foundBytes] ) {30 foundBytes++;31 } else {32 if ( foundBytes != 0 ) {33 for ( int i=0; i<foundBytes; i++ ) {34 System.out.print( (char)xmlIndicator[i] );35 }36 foundBytes = 0;37 }38 if ( c != -1 ) {39 System.out.print( (char)c );40 }41 }42 43 if ( foundBytes == xmlIndicator.length ) {44 return;45 }46 47 }48 49 System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +50 "The XML being loaded did not contain the '<?xml' string as expected" );51 }52 53 public int read( char[] cbuf, int off, int len ) throws IOException {54 55 for ( int i=off; i<off+len && i<cbuf.length; i++ ) {56 57 //read from underlying reader58 int c = read();59 60 //catch end of stream61 if ( c == -1 ) {62 if ( i == off ) {63 return -1;64 }65 return i - off;66 }67 68 //insert character into the array69 cbuf[i] = (char)c;70 }71 return len;72 73 }74 75 public int read() throws IOException {76 77 if ( allFoundBytesReturned ) {78 return ur.read();79 }80 81 if ( foundBytesReturned < xmlIndicator.length ) {82 return xmlIndicator[foundBytesReturned++];83 }84 85 allFoundBytesReturned = true;86 return ur.read();87 }88 89 public void close() throws IOException {90 ur.close();91 }92 186 }
Note: See TracChangeset for help on using the changeset viewer.