[18170] | 1 | package org.greenstone.gatherer.util;
|
---|
| 2 |
|
---|
| 3 | import java.io.Reader;
|
---|
| 4 | import java.io.IOException;
|
---|
| 5 |
|
---|
[18833] | 6 | import java.io.File;
|
---|
| 7 | import java.io.FileReader;
|
---|
| 8 |
|
---|
| 9 | import java.util.regex.Pattern;
|
---|
| 10 |
|
---|
/**
 * A Reader that discards any content preceding the start of an XML document
 * in the wrapped Reader.
 *
 * The constructor consumes the wrapped Reader until it sees an XML
 * declaration, a DOCTYPE declaration or something that looks like an opening
 * element tag (outside of an XML comment). Everything consumed before that
 * point, including any comments, is echoed to System.err. Subsequent reads
 * return the detected start sequence followed by the rest of the wrapped
 * Reader's content.
 */
public class RemoveContentBeforeRootElementXMLReader extends Reader {

    /**
     * The sequences taken to mark the start of the XML input, tried in order.
     * Note that the root-node pattern only accepts names made of ASCII letters
     * and '-', terminated by a space, '>' or '/'.
     */
    static final Pattern[] xmlIndicators = {
        Pattern.compile("<\\?xml"),          //usual xml declaration
        Pattern.compile("<!DOCTYPE"),        //doctype declaration
        Pattern.compile("<[a-zA-Z-]+[ >/]")  // the beginning of a root node
    };

    /** Opening delimiter of an XML comment. */
    static Pattern commentStart = Pattern.compile("<!--");

    /** Closing delimiter of an XML comment. */
    static Pattern commentStop = Pattern.compile("-->");

    /** The wrapped (underlying) reader. */
    Reader ur;

    /**
     * The start sequence that was consumed from the underlying reader while
     * searching and still has to be handed to the caller; null when nothing
     * is pending.
     */
    String finalBuffer = null;

    /** Index of the next character of finalBuffer to hand out. */
    int finalBufferIndex = 0;

    /**
     * Wraps the given reader and immediately consumes it up to the start of
     * the XML input. If no start is found the whole input is consumed and a
     * message is written to System.err.
     *
     * @param ur the reader to wrap; must not be null
     * @throws NullPointerException if ur is null
     */
    public RemoveContentBeforeRootElementXMLReader( Reader ur ) {

        if ( ur == null ) {
            throw new NullPointerException( "ur" );
        }
        this.ur = ur;

        finalBuffer = scanForXmlStart();

        if ( finalBuffer == null ) {
            System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
                "The XML being loaded was not valid: couldn't find start of XML input" );
        }
    }

    /**
     * Reads the underlying reader one character at a time until one of the
     * xmlIndicators matches outside of a comment, echoing skipped content to
     * System.err.
     *
     * @return the matched start sequence (from its '<' onwards), or null if
     *         the input ended, or could not be read, before a start was found
     */
    private String scanForXmlStart() {

        boolean inComment = false;
        StringBuffer buffer = null;

        while ( true ) {

            //read a character
            int c;
            try {
                c = ur.read();
            } catch ( IOException e ) {
                //stop scanning: retrying would re-process a stale character forever
                System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" + ": " + e );
                break;
            }

            //stop if we have reached the end of the input
            if ( c == -1 ) {
                break;
            }

            //buffering starts at the first '<', whether or not it turns out
            //to be relevant; before that characters are simply displayed
            if ( buffer == null ) {
                if ( (char)c != '<' ) {
                    System.err.print( (char)c );
                    continue;
                }
                buffer = new StringBuffer();
            }

            buffer.append( (char)c );

            //inside a comment only the closing delimiter is of interest
            if ( inComment ) {
                if ( commentStop.matcher(buffer).find() ) {
                    inComment = false;
                    System.err.print( buffer.toString() );
                    buffer = new StringBuffer();
                }
                continue;
            }

            //a comment opening: display what we have and start afresh
            if ( commentStart.matcher(buffer).find() ) {
                inComment = true;
                System.err.print( buffer.toString() );
                buffer = new StringBuffer();
                continue;
            }

            //check each indicator to see if the start of the XML was reached
            for ( int i = 0; i < xmlIndicators.length; i++ ) {
                if ( xmlIndicators[i].matcher(buffer).find() ) {
                    String text = buffer.toString();
                    int start = text.lastIndexOf('<');
                    //flush the characters preceding the start sequence to the console
                    System.err.print( text.substring(0, start) );
                    return text.substring(start);
                }
            }
        }

        //input ended while buffering: display the leftovers rather than losing them
        if ( buffer != null && buffer.length() > 0 ) {
            System.err.print( buffer.toString() );
        }
        return null;
    }

    /**
     * Reads characters into a portion of an array: first any pending start
     * sequence, then content of the underlying reader. Keeps reading until
     * len characters have been stored or the end of the stream is reached.
     *
     * @param cbuf destination array
     * @param off index in cbuf at which to store the first character
     * @param len maximum number of characters to read
     * @return the number of characters stored, or -1 if the end of the stream
     *         was reached before any character could be stored
     * @throws IOException if the underlying reader fails
     * @throws IndexOutOfBoundsException if off or len is negative, or len is
     *         greater than cbuf.length - off
     */
    public int read( char[] cbuf, int off, int len ) throws IOException {

        if ( off < 0 || len < 0 || len > cbuf.length - off ) {
            throw new IndexOutOfBoundsException(
                "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length );
        }
        if ( len == 0 ) {
            return 0;
        }

        int count = 0;

        //hand out the buffered opening XML sequence first
        if ( finalBuffer != null ) {
            int pending = finalBuffer.length() - finalBufferIndex;
            count = Math.min( pending, len );
            finalBuffer.getChars( finalBufferIndex, finalBufferIndex + count, cbuf, off );
            finalBufferIndex += count;
            if ( finalBufferIndex >= finalBuffer.length() ) {
                finalBuffer = null;
            }
        }

        //then fill the remainder from the underlying reader
        while ( count < len ) {
            int n = ur.read( cbuf, off + count, len - count );
            if ( n == -1 ) {
                //end of stream
                return ( count == 0 ) ? -1 : count;
            }
            if ( n == 0 ) {
                //defensive: do not spin on a reader that makes no progress
                break;
            }
            count += n;
        }
        return count;
    }

    /**
     * Reads a single character: the next character of the pending start
     * sequence if there is one, otherwise the next character of the
     * underlying reader.
     *
     * @return the character read, or -1 at the end of the stream
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {

        //flush the buffer containing the opening XML sequence
        if ( finalBuffer != null && finalBuffer.length() > finalBufferIndex ) {
            char c = finalBuffer.charAt(finalBufferIndex++);
            if ( finalBufferIndex == finalBuffer.length() ) {
                finalBuffer = null;
            }
            return c;
        }

        return ur.read();
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing the underlying reader fails
     */
    public void close() throws IOException {
        ur.close();
    }

    /**
     * Manual test: filters the file text.xml in the current directory and
     * prints the remaining XML to System.out.
     *
     * @param args ignored
     */
    public static void main ( String[] args ) {

        //init
        System.out.println( "------------\nWill now initialise the test reader\n------------" );
        RemoveContentBeforeRootElementXMLReader parser = null;
        try {
            parser = new RemoveContentBeforeRootElementXMLReader(
                new FileReader( new File("text.xml") ) );
        } catch ( java.io.FileNotFoundException fnfe ) {
            System.err.println( "Please create text.xml to test this class" );
            System.exit(-1);
        }

        //read the rest of the input
        System.out.println( "------------\nWill now read the rest of the input\n------------" );
        try {
            int c = 0;
            while ( ( c = parser.read() ) != -1 ) {
                System.out.print( (char)c );
            }
        } catch ( IOException e ) {
            System.err.println("Exception: " + e);
        } finally {
            //release the file handle whatever happened above
            try {
                parser.close();
            } catch ( IOException e ) {
                System.err.println("Exception: " + e);
            }
        }

    }

}
|
---|