source: gli/branches/2.81-fixed/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java@ 18833

Last change on this file since 18833 was 18833, checked in by kjdon, 15 years ago

Bug fix 18462 committed to branch: `<?xml` is no longer the only accepted XML opening marker; `<!DOCTYPE` and the start tag of any root element are now accepted as well.

File size: 5.0 KB
Line 
1package org.greenstone.gatherer.util;
2
3import java.io.Reader;
4import java.io.IOException;
5
6import java.io.File;
7import java.io.FileReader;
8
9import java.util.regex.Pattern;
10
/**
 * A {@link Reader} wrapper that discards everything in the underlying
 * character stream that precedes the start of the XML content.
 *
 * <p>On construction the underlying reader is consumed until one of the
 * following is seen outside of an XML comment: an XML declaration
 * (<code>&lt;?xml</code>), a document type declaration
 * (<code>&lt;!DOCTYPE</code>) or the start of an element tag. Everything
 * read before that point (including any comments) is echoed to
 * <code>System.err</code>. Subsequent <code>read</code> calls return the
 * matched opening sequence followed by the remainder of the underlying
 * reader.
 *
 * <p>If no XML start is found, a message is written to
 * <code>System.err</code> and the reader behaves as an exhausted stream.
 */
public class RemoveContentBeforeRootElementXMLReader extends Reader {

    /** The sequences that are accepted as the beginning of the XML content. */
    static final Pattern[] xmlIndicators = {
        Pattern.compile("<\\?xml"),      // usual xml declaration
        Pattern.compile("<!DOCTYPE"),    // doctype declaration
        // The beginning of a root node: '<', an element name, then whitespace,
        // '>' or '/'. Names may contain digits, '_', '.', ':' and '-' after the
        // first character, and may be followed by any whitespace (not only ' ').
        Pattern.compile("<[a-zA-Z_:][a-zA-Z0-9_:.-]*[\\s>/]")
    };

    /** Opening delimiter of an XML comment. */
    static final Pattern commentStart = Pattern.compile("<!--");

    /** Closing delimiter of an XML comment. */
    static final Pattern commentStop = Pattern.compile("-->");

    /** The wrapped reader; positioned just after the matched opening sequence. */
    Reader ur;

    /**
     * The matched opening sequence that was consumed from {@link #ur} while
     * searching and still has to be handed to the caller; null once it has
     * been fully returned, or if no XML start was found.
     */
    String finalBuffer = null;

    /** Index of the next character of {@link #finalBuffer} to return. */
    int finalBufferIndex = 0;

    /**
     * Wraps the given reader and immediately consumes it up to the start of
     * the XML content. Skipped content is echoed to <code>System.err</code>.
     *
     * @param ur the reader to wrap; it is read from during construction
     */
    public RemoveContentBeforeRootElementXMLReader( Reader ur ) {

        this.ur = ur;
        this.finalBuffer = skipToXmlStart( ur );

        if ( finalBuffer == null ) {
            System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
                    "The XML being loaded was not valid: couldn't find start of XML input" );
        }
    }

    /**
     * Consumes characters from the reader until an XML start indicator is
     * found outside of a comment, echoing everything skipped to
     * <code>System.err</code>.
     *
     * @param in the reader to consume
     * @return the matched opening sequence (already consumed from the reader),
     *         or null if the input ended, or could not be read, before any
     *         indicator was found
     */
    private static String skipToXmlStart( Reader in ) {

        // Holds the text seen since the most recent '<' (or since the end of a
        // comment). Null until the first '<' has been read.
        StringBuffer pending = null;
        boolean inComment = false;

        while ( true ) {

            int c;
            try {
                c = in.read();
            } catch ( IOException e ) {
                // A failed read must end the scan: carrying on with a stale
                // character would loop forever.
                System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader: " + e );
                c = -1;
            }

            // end of input (or unreadable input) without finding the XML start
            if ( c == -1 ) {
                if ( pending != null ) {
                    System.err.print( pending.toString() );
                }
                return null;
            }

            char ch = (char)c;

            if ( pending == null ) {
                // nothing of interest can start before the first '<'
                if ( ch != '<' ) {
                    System.err.print( ch );
                    continue;
                }
                pending = new StringBuffer();
            } else if ( ch == '<' ) {
                // Every pattern we look for starts with '<' and contains no
                // other '<', so text before this one can no longer be part of
                // a match: flush it and start afresh.
                System.err.print( pending.toString() );
                pending.setLength( 0 );
            }

            pending.append( ch );
            String text = pending.toString();

            if ( inComment ) {
                // inside a comment only its terminator is of interest
                if ( commentStop.matcher( text ).find() ) {
                    inComment = false;
                    System.err.print( text );
                    pending.setLength( 0 );
                }
                continue;
            }

            if ( commentStart.matcher( text ).find() ) {
                inComment = true;
                System.err.print( text );
                pending.setLength( 0 );
                continue;
            }

            //check each indicator to see if found
            for ( int i = 0; i < xmlIndicators.length; i++ ) {
                java.util.regex.Matcher m = xmlIndicators[i].matcher( text );
                if ( m.find() ) {
                    //flush the characters preceding the match to the console
                    System.err.print( text.substring( 0, m.start() ) );
                    return text.substring( m.start() );
                }
            }
        }
    }

    /**
     * Reads characters into a portion of an array. The buffered opening
     * sequence is returned first (possibly as a short read); once it is
     * exhausted, reads are delegated to the underlying reader.
     *
     * @param cbuf destination buffer
     * @param off  offset at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters read, or -1 at end of stream
     * @throws IOException if the underlying reader fails
     * @throws IndexOutOfBoundsException if <code>off</code> or <code>len</code>
     *         is negative, or <code>len</code> exceeds
     *         <code>cbuf.length - off</code>
     */
    public int read( char[] cbuf, int off, int len ) throws IOException {

        if ( off < 0 || len < 0 || len > cbuf.length - off ) {
            throw new IndexOutOfBoundsException(
                    "off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length );
        }
        if ( len == 0 ) {
            return 0;
        }

        //hand out the buffered opening XML sequence first
        if ( finalBuffer != null ) {
            int available = finalBuffer.length() - finalBufferIndex;
            int count = Math.min( available, len );
            if ( count > 0 ) {
                finalBuffer.getChars( finalBufferIndex, finalBufferIndex + count, cbuf, off );
                finalBufferIndex += count;
            }
            if ( finalBufferIndex >= finalBuffer.length() ) {
                finalBuffer = null;
            }
            if ( count > 0 ) {
                // A short read is allowed; the next call continues with the
                // underlying reader.
                return count;
            }
        }

        return ur.read( cbuf, off, len );
    }

    /**
     * Reads a single character: first from the buffered opening sequence,
     * then from the underlying reader.
     *
     * @return the character read, or -1 at end of stream
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {

        //flush the buffer containing the opening XML sequence
        if ( finalBuffer != null && finalBufferIndex < finalBuffer.length() ) {
            char c = finalBuffer.charAt( finalBufferIndex++ );
            if ( finalBufferIndex == finalBuffer.length() ) {
                finalBuffer = null;
            }
            return c;
        }

        return ur.read();
    }

    /**
     * Closes the underlying reader.
     *
     * @throws IOException if closing the underlying reader fails
     */
    public void close() throws IOException {
        ur.close();
    }

    /**
     * Manual test: filters the file <code>text.xml</code> in the current
     * directory, showing skipped content on <code>System.err</code> and the
     * retained XML on <code>System.out</code>.
     *
     * @param args ignored
     */
    public static void main ( String[] args ) {

        //init
        System.out.println( "------------\nWill now initialise the test reader\n------------" );
        RemoveContentBeforeRootElementXMLReader parser = null;
        try {
            parser = new RemoveContentBeforeRootElementXMLReader(
                    new FileReader( new File("text.xml") ) );
        } catch ( java.io.FileNotFoundException fnfe ) {
            System.err.println( "Please create text.xml to test this class" );
            System.exit(-1);
        }

        //read the rest of the input
        System.out.println( "------------\nWill now read the rest of the input\n------------" );
        try {
            int c = 0;
            while ( ( c = parser.read() ) != -1 ) {
                System.out.print( (char)c );
            }
        } catch ( Exception e ) {
            System.err.println("Exception: " + e);
        } finally {
            //release the file handle
            try {
                parser.close();
            } catch ( IOException e ) {
                System.err.println("Exception: " + e);
            }
        }

    }

}
Note: See TracBrowser for help on using the repository browser.