Changeset 18833


Ignore:
Timestamp:
03/30/09 12:58:55 (12 years ago)
Author:
kjdon
Message:

Bug fix 18462 committed to branch: the reader no longer expects "<?xml" to be the only acceptable XML opening marker; it also accepts "<!DOCTYPE" and any root element.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gli/branches/2.81-fixed/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java

    r18195 r18833  
    44import java.io.IOException;
    55
     6import java.io.File;
     7import java.io.FileReader;
     8
     9import java.util.regex.Pattern;
     10
    611public class RemoveContentBeforeRootElementXMLReader extends Reader {
    712
    8     static final byte[] xmlIndicator = "<?xml".getBytes();
     13  static final Pattern[] xmlIndicators;
     14  static Pattern commentStart;
     15  static Pattern commentStop;
    916
    10     Reader ur;
    11     int foundBytesReturned = 0;
    12     boolean allFoundBytesReturned = false; //redundant, but may help performance
     17  static {
     18    // generate the xml starting sequences we will try to match against
     19    xmlIndicators = new Pattern[3];
     20    try{
     21      xmlIndicators[0] = Pattern.compile("<\\?xml"); //usual xml declaration
     22      xmlIndicators[1] = Pattern.compile("<!DOCTYPE"); //doctype declaration
     23      xmlIndicators[2] = Pattern.compile("<[a-zA-Z-]+[ >/]"); // the beginning of a root node
     24
     25      commentStart = Pattern.compile("<!--");
     26      commentStop = Pattern.compile("-->");
     27
     28    } catch ( java.util.regex.PatternSyntaxException pse ) {
     29        System.err.println( "Pattern no good. " + pse );
     30        xmlIndicators[0] = xmlIndicators[1] = xmlIndicators[2] = null;
     31        commentStart = commentStop = null;
     32    }
     33  }
     34
     35  Reader ur;
     36
     37  String finalBuffer = null;
     38  int finalBufferIndex = 0;
     39
     40  public RemoveContentBeforeRootElementXMLReader( Reader ur ) {
     41
     42    this.ur = ur;
     43
     44    int found = -1;
     45    boolean inComment = false;
     46    StringBuffer buffer = null;
     47
     48    for ( int c = 0;  c != -1 && found == -1; ) {
     49
     50      //read a character
     51      try {
     52        c = ur.read();
     53      } catch( Exception e ) {
     54        System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" );
     55      }
     56
     57      //break out if we have reached the end of the input
     58      if ( c == -1 ) {
     59        break;
     60      }
     61
     62      //we start buffering when we come across the first <
     63      //regardless of whether it turns out to be a relevant < or not
     64      if ( buffer == null && (char)c == '<' )  {
     65        buffer = new StringBuffer();
     66      }
     67
     68      //if not buffering, just display the character and move onto next character
     69      if ( buffer == null ) {
     70         System.err.print( (char)c );
     71        continue;
     72      }
     73
     74      buffer.append( (char)c );
     75
     76      //check for comment open or close
     77      if ( !inComment ) {
     78        if ( commentStart.matcher(buffer.toString()).find() ) {
     79          inComment = true;
     80          System.err.print( buffer.toString() );
     81          buffer = new StringBuffer();
     82        }
     83      } else {
     84        if ( commentStop.matcher(buffer.toString()).find() ) {
     85          inComment = false;
     86          System.err.print( buffer.toString() );
     87          buffer = new StringBuffer();
     88
     89          //skip to reading next character
     90          continue;
     91        }
     92      }
     93
     94      if ( !inComment ) {
     95
     96        //check each indicator to see if found
     97        for ( int i = 0; i < xmlIndicators.length && found == -1; i++ ) {
     98          if ( xmlIndicators[i].matcher(buffer.toString()).find() ) {
     99            found = i;
     100            String line = buffer.toString();
     101            int lastIndex = line.lastIndexOf('<');
     102            //flush the previous characters in the buffer to the console
     103            System.err.print(line.substring(0, lastIndex));
     104            buffer.delete(0, lastIndex);
     105            finalBuffer = buffer.toString();
     106          }
     107        }
     108
     109      }
     110    }
     111
     112    if ( found == -1 ) {
     113      System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
     114        "The XML being loaded was not valid: couldn't find start of XML input" );
     115    }
     116 
     117  }
     118
     119  public int read( char[] cbuf, int off, int len ) throws IOException {
     120
     121    for ( int i=off; i<off+len && i<cbuf.length; i++ ) {
     122
     123      //read from underlying reader
     124      int c = read();
     125
     126      //catch end of stream
     127      if ( c == -1 ) {
     128        if ( i == off ) {
     129          return -1;
     130        }
     131        return i - off;
     132      }
     133
     134      //insert character into the array
     135      cbuf[i] = (char)c;
     136    }
     137    return len;
     138
     139  }
     140
     141  public int read() throws IOException {
     142
     143    //flush the buffer containing the opening XML sequence
     144    if ( finalBuffer != null && finalBuffer.length() > finalBufferIndex ) {
     145      char c = finalBuffer.charAt(finalBufferIndex++);
     146      if ( finalBufferIndex == finalBuffer.length() ) {
     147        finalBuffer = null;
     148      }
     149      return c;
     150    }
     151
     152    return ur.read();
     153  }
     154
     155  public void close() throws IOException {
     156    ur.close();
     157  }
     158
     159  public static void main ( String[] args ) {
     160
     161    //init
     162    System.out.println( "------------\nWill now initialise the test reader\n------------" );
     163    RemoveContentBeforeRootElementXMLReader parser = null;
     164    try {
     165      parser = new RemoveContentBeforeRootElementXMLReader(
     166        new FileReader( new File("text.xml") ) );
     167    } catch ( java.io.FileNotFoundException fnfe ) {
     168      System.err.println( "Please create text.xml to test this class" );
     169      System.exit(-1);
     170    }
     171
     172    //read the rest of the input
     173    System.out.println( "------------\nWill now read the rest of the input\n------------" );
     174    try {
     175      int c = 0;
     176      while ( ( c = parser.read() ) != -1 ) {
     177        System.out.print( (char)c );
     178      }
     179    } catch ( Exception e ) {
     180      System.err.println("Exception: " + e);
     181    }
     182   
     183  }
    13184
    14185
    // Left-column-only line numbers: this is the r18195 constructor, absent from r18833.
    // It reads the wrapped reader until every byte of xmlIndicator has been matched in
    // sequence, echoing skipped characters to System.out, and returns at that point.
    // If the input ends first, it reports the missing marker on System.err.
    // NOTE(review): a character that breaks a partial match is printed but not re-tested
    // against xmlIndicator[0], so a marker directly after a stray partial match is missed.
    15     public RemoveContentBeforeRootElementXMLReader( Reader ur ) {
    16 
    17         this.ur = ur;
    18 
    19         //read up to the xml indicator
    20         int foundBytes = 0;
    21         int c = 0;
    22         while ( c != -1 ) {
    23             try {
    24                 c = ur.read();
    25             } catch( Exception e ) {
    26                 System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader" );
    27             }
    28            
    29             if ( c == (int)xmlIndicator[foundBytes] ) {
    30                 foundBytes++;
    31             } else {               
    32                 if ( foundBytes != 0 ) {
    // replay the partially matched marker bytes that turned out not to be the marker
    33                 for ( int i=0; i<foundBytes; i++ ) {
    34                     System.out.print( (char)xmlIndicator[i] );
    35                 }
    36                 foundBytes = 0;
    37                 }
    38                 if ( c != -1 ) {
    39                 System.out.print( (char)c );
    40                 }
    41             }
    42 
    43             if ( foundBytes == xmlIndicator.length ) {
    44                 return;
    45             }
    46 
    47         }
    48        
    49         System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
    50             "The XML being loaded did not contain the '<?xml' string as expected" );
    51     }
    52 
    // Left-column-only line numbers: r18195 version of the bulk read.
    // Fills cbuf from index off, one character at a time via read(), stopping at
    // off+len or at the end of cbuf. Returns -1 if the stream ends before anything
    // is stored, the stored count if it ends part-way, and len otherwise.
    53     public int read( char[] cbuf, int off, int len ) throws IOException {
    54 
    55         for ( int i=off; i<off+len && i<cbuf.length; i++ ) {
    56 
    57             //read from underlying reader
    58             int c = read();
    59 
    60             //catch end of stream
    61             if ( c == -1 ) {
    62                 if ( i == off ) {
    63                     return -1;
    64                 }
    65                 return i - off;
    66             }
    67 
    68             //insert character into the array
    69             cbuf[i] = (char)c;
    70         }
    71         return len;
    72 
    73     }
    74 
    // Left-column-only line numbers: r18195 version of the single-character read.
    // Hands back the bytes of xmlIndicator one per call (the constructor consumed
    // them from the wrapped reader), then delegates every later call to ur.read().
    75     public int read() throws IOException {
    76 
    77         if ( allFoundBytesReturned ) {
    78             return ur.read();
    79         }
    80 
    81         if ( foundBytesReturned < xmlIndicator.length ) {
    82             return xmlIndicator[foundBytesReturned++];
    83         }
    84 
    85         allFoundBytesReturned = true;
    86         return ur.read();
    87     }
    88 
    // Left-column-only line numbers: r18195 version; closes the wrapped reader.
    89     public void close() throws IOException {
    90         ur.close();
    91     }
    92186}
Note: See TracChangeset for help on using the changeset viewer.