package org.greenstone.gatherer.util; import java.io.Reader; import java.io.IOException; import java.io.File; import java.io.FileReader; import java.util.regex.Pattern; public class RemoveContentBeforeRootElementXMLReader extends Reader { static final Pattern[] xmlIndicators; static Pattern commentStart; static Pattern commentStop; static { // generate the xml starting sequences we will try to match against xmlIndicators = new Pattern[3]; try{ xmlIndicators[0] = Pattern.compile("<\\?xml"); //usual xml declaration xmlIndicators[1] = Pattern.compile("/]"); // the beginning of a root node commentStart = Pattern.compile(""); } catch ( java.util.regex.PatternSyntaxException pse ) { System.err.println( "Pattern no good. " + pse ); xmlIndicators[0] = xmlIndicators[1] = xmlIndicators[2] = null; commentStart = commentStop = null; } } Reader ur; String finalBuffer = null; int finalBufferIndex = 0; public RemoveContentBeforeRootElementXMLReader( Reader ur ) { this.ur = ur; int found = -1; boolean inComment = false; StringBuffer buffer = null; for ( int c = 0; c != -1 && found == -1; ) { //read a character try { c = ur.read(); } catch( Exception e ) { System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" ); } //break out if we have reached the end of the input if ( c == -1 ) { break; } //we start buffering when we come across the first < //regardless of whether it turns out to be a relevant < or not if ( buffer == null && (char)c == '<' ) { buffer = new StringBuffer(); } //if not buffering, just display the character and move onto next character if ( buffer == null ) { System.err.print( (char)c ); continue; } buffer.append( (char)c ); //check for comment open or close if ( !inComment ) { if ( commentStart.matcher(buffer.toString()).find() ) { inComment = true; System.err.print( buffer.toString() ); buffer = new StringBuffer(); } } else { if ( commentStop.matcher(buffer.toString()).find() ) { inComment = false; System.err.print( buffer.toString() ); buffer = new StringBuffer(); //skip to reading next character continue; } } if ( !inComment ) { //check each indicator to see if found for ( int i = 0; i < xmlIndicators.length && found == -1; i++ ) { if ( xmlIndicators[i].matcher(buffer.toString()).find() ) { found = i; String line = buffer.toString(); int lastIndex = line.lastIndexOf('<'); //flush the previous characters in the buffer to the console System.err.print(line.substring(0, lastIndex)); buffer.delete(0, lastIndex); finalBuffer = buffer.toString(); } } } } if ( found == -1 ) { System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" + "The XML being loaded was not valid: couldn't find start of XML input" ); } } public int read( char[] cbuf, int off, int len ) throws IOException { for ( int i=off; i finalBufferIndex ) { char c = finalBuffer.charAt(finalBufferIndex++); if ( finalBufferIndex == finalBuffer.length() ) { finalBuffer = null; } return c; } return ur.read(); } public void close() throws IOException { ur.close(); } public static void main ( String[] args ) { //init System.out.println( "------------\nWill now initialise the test reader\n------------" ); RemoveContentBeforeRootElementXMLReader parser = null; try { parser = new RemoveContentBeforeRootElementXMLReader( new FileReader( new File("text.xml") ) ); } catch ( java.io.FileNotFoundException fnfe ) { System.err.println( "Please create text.xml to test this class" ); System.exit(-1); } //read the rest of the input System.out.println( "------------\nWill now read the rest of the input\n------------" ); try { int c = 0; while ( ( c = parser.read() ) != -1 ) { System.out.print( (char)c ); } } catch ( Exception e ) { System.err.println("Exception: " + e); } } }