source: gli/trunk/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java@ 18462

Last change on this file since 18462 was 18462, checked in by oranfry, 15 years ago

No longer expects ?xml to be the only acceptable XML opening marker; it now also accepts DOCTYPE and any root element.

File size: 5.0 KB
Line 
1package org.greenstone.gatherer.util;
2
3import java.io.Reader;
4import java.io.IOException;
5
6import java.io.File;
7import java.io.FileReader;
8
9import java.util.regex.Pattern;
10
/**
 * A Reader that discards everything preceding the start of an XML document.
 *
 * On construction the wrapped reader is consumed up to the first XML start
 * marker found outside a comment: an XML declaration, a DOCTYPE declaration
 * or the opening tag of a root element. Everything skipped on the way
 * (including comments) is echoed to System.err. Subsequent reads return the
 * start marker followed by the remainder of the wrapped reader.
 */
public class RemoveContentBeforeRootElementXMLReader extends Reader {

    /** Sequences that are accepted as the start of the XML input. */
    static final Pattern[] xmlIndicators = {
        //usual xml declaration
        Pattern.compile("<\\?xml"),
        //doctype declaration
        Pattern.compile("<!DOCTYPE"),
        //the beginning of a root node: a name (optionally namespace-prefixed,
        //may contain digits, '_', '.' and '-') followed by whitespace, '>' or '/'
        Pattern.compile("<[a-zA-Z_-][a-zA-Z0-9_.-]*(:[a-zA-Z_][a-zA-Z0-9_.-]*)?[\\s>/]")
    };
    static Pattern commentStart = Pattern.compile("<!--");
    static Pattern commentStop = Pattern.compile("-->");

    /** The underlying reader. */
    Reader ur;

    /** The XML start marker consumed by the constructor and not yet handed out; null when nothing is pending. */
    String finalBuffer = null;
    /** Index of the next character of finalBuffer to hand out. */
    int finalBufferIndex = 0;

    /**
     * Wraps the given reader and immediately skips ahead to the start of the XML.
     * If no start marker is found (or the underlying reader fails while being
     * scanned) a message is printed to System.err and this reader simply
     * delegates to whatever is left of the underlying reader.
     *
     * @param ur the reader to take the input from; must not be null
     * @throws NullPointerException if ur is null
     */
    public RemoveContentBeforeRootElementXMLReader( Reader ur ) {

        if ( ur == null ) {
            throw new NullPointerException( "The underlying Reader must not be null" );
        }
        this.ur = ur;

        if ( !skipToXmlStart() ) {
            System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
                "The XML being loaded was not valid: couldn't find start of XML input" );
        }
    }

    /**
     * Consumes the underlying reader up to and including the first XML start
     * marker, echoing all skipped content to System.err and storing the marker
     * in finalBuffer.
     *
     * @return true if a start marker was found, false if the input ended
     *         (or could not be read) before one was seen
     */
    private boolean skipToXmlStart() {

        boolean inComment = false;
        StringBuffer buffer = null;

        while ( true ) {

            int c;
            try {
                c = ur.read();
            } catch ( IOException e ) {
                //stop scanning: retrying would re-process a stale character and may never terminate
                System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader: " + e );
                return false;
            }

            if ( c == -1 ) {
                //end of input: show the content that was buffered but never matched
                if ( buffer != null ) {
                    System.err.print( buffer.toString() );
                }
                return false;
            }

            //we start buffering when we come across the first <
            //regardless of whether it turns out to be a relevant < or not;
            //until then characters are just displayed
            if ( buffer == null ) {
                if ( (char)c != '<' ) {
                    System.err.print( (char)c );
                    continue;
                }
                buffer = new StringBuffer();
            }

            buffer.append( (char)c );

            //inside a comment the only thing of interest is the end of the comment
            if ( inComment ) {
                if ( commentStop.matcher(buffer).find() ) {
                    inComment = false;
                    System.err.print( buffer.toString() );
                    buffer.setLength(0);
                }
                continue;
            }

            //a comment opening flushes everything buffered so far
            if ( commentStart.matcher(buffer).find() ) {
                inComment = true;
                System.err.print( buffer.toString() );
                buffer.setLength(0);
                continue;
            }

            //check each indicator to see if found
            for ( int i = 0; i < xmlIndicators.length; i++ ) {
                java.util.regex.Matcher m = xmlIndicators[i].matcher(buffer);
                if ( m.find() ) {
                    int start = m.start();
                    //flush the characters preceding the marker to the console
                    System.err.print( buffer.substring(0, start) );
                    finalBuffer = buffer.substring(start);
                    finalBufferIndex = 0;
                    return true;
                }
            }
        }
    }

    /**
     * Reads characters into a portion of an array: first whatever is left of
     * the buffered XML start marker, then characters from the underlying reader.
     * Never writes past the end of cbuf; if fewer than len characters fit, only
     * those that fit are read.
     *
     * @param cbuf destination array
     * @param off  index in cbuf at which to start storing characters
     * @param len  maximum number of characters to read
     * @return the number of characters actually stored, 0 if there was no room
     *         (or len was not positive), or -1 at the end of the input
     * @throws IOException if the underlying reader fails
     */
    public int read( char[] cbuf, int off, int len ) throws IOException {

        int count = Math.min( len, cbuf.length - off );
        if ( count <= 0 ) {
            return 0;
        }

        //hand out the pending part of the XML start marker first
        int copied = 0;
        if ( finalBuffer != null ) {
            int pending = finalBuffer.length() - finalBufferIndex;
            copied = Math.max( 0, Math.min( count, pending ) );
            finalBuffer.getChars( finalBufferIndex, finalBufferIndex + copied, cbuf, off );
            finalBufferIndex += copied;
            if ( finalBufferIndex >= finalBuffer.length() ) {
                finalBuffer = null;
            }
        }

        if ( copied == count ) {
            return copied;
        }

        //fill the rest from the underlying reader in one go
        int n = ur.read( cbuf, off + copied, count - copied );
        if ( n == -1 ) {
            return copied > 0 ? copied : -1;
        }
        return copied + n;
    }

    /**
     * Reads a single character: from the buffered XML start marker while any
     * of it is pending, otherwise from the underlying reader.
     *
     * @return the character read, or -1 at the end of the input
     * @throws IOException if the underlying reader fails
     */
    public int read() throws IOException {

        if ( finalBuffer != null ) {
            if ( finalBufferIndex < finalBuffer.length() ) {
                char c = finalBuffer.charAt( finalBufferIndex++ );
                if ( finalBufferIndex == finalBuffer.length() ) {
                    finalBuffer = null;
                }
                return c;
            }
            finalBuffer = null;
        }

        return ur.read();
    }

    /**
     * Closes the underlying reader and drops any pending start marker so that
     * no buffered characters are served after closing.
     *
     * @throws IOException if closing the underlying reader fails
     */
    public void close() throws IOException {
        finalBuffer = null;
        ur.close();
    }

    /**
     * Manual test: filters the file text.xml in the current directory and
     * prints the retained XML to System.out (skipped content goes to System.err).
     */
    public static void main ( String[] args ) {

        //init
        System.out.println( "------------\nWill now initialise the test reader\n------------" );
        RemoveContentBeforeRootElementXMLReader parser = null;
        try {
            parser = new RemoveContentBeforeRootElementXMLReader(
                new FileReader( new File("text.xml") ) );
        } catch ( java.io.FileNotFoundException fnfe ) {
            System.err.println( "Please create text.xml to test this class" );
            System.exit(-1);
        }

        //read the rest of the input
        System.out.println( "------------\nWill now read the rest of the input\n------------" );
        try {
            int c = 0;
            while ( ( c = parser.read() ) != -1 ) {
                System.out.print( (char)c );
            }
        } catch ( Exception e ) {
            System.err.println("Exception: " + e);
        } finally {
            try {
                parser.close();
            } catch ( IOException ioe ) {
                System.err.println("Exception: " + ioe);
            }
        }

    }

}
Note: See TracBrowser for help on using the repository browser.