Context Navigation

source: gli/branches/2.81-fixed/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java@ 18833

Last change on this file since 18833 was 18833, checked in by kjdon, 15 years ago
Bug fix 18462 committed to branch: no longer expects "<?xml" to be the only acceptable XML opening marker; a DOCTYPE declaration or any root element is now accepted as well.
File size: 5.0 KB

Line
1	package org.greenstone.gatherer.util;
2
3	import java.io.Reader;
4	import java.io.IOException;
5
6	import java.io.File;
7	import java.io.FileReader;
8
9	import java.util.regex.Pattern;
10
/**
 * A Reader that discards everything in the underlying Reader that precedes
 * the start of an XML document, and then hands out the rest unchanged.
 *
 * The start of the document is the first occurrence, outside of an XML
 * comment, of an XML declaration ("&lt;?xml"), a DOCTYPE declaration
 * ("&lt;!DOCTYPE") or the opening tag of an element. The discarded leading
 * content is echoed to System.err while the constructor scans for it.
 */
public class RemoveContentBeforeRootElementXMLReader extends Reader {

	/** The sequences that are accepted as the start of the XML content, tried in order. */
	static final Pattern[] xmlIndicators = new Pattern[] {
		Pattern.compile("<\\?xml"), //usual xml declaration
		Pattern.compile("<!DOCTYPE"), //doctype declaration
		// the beginning of a root node: an (ASCII) XML name followed by
		// whitespace, '>' or '/'. Digits, '_', ':' and '.' are legal in names,
		// and the name may be followed by a newline or tab, not only a space.
		Pattern.compile("<[a-zA-Z_:][a-zA-Z0-9_:.-]*[\\s>/]")
	};
	static final Pattern commentStart = Pattern.compile("<!--");
	static final Pattern commentStop = Pattern.compile("-->");

	/** The underlying reader; positioned just after the opening sequence once constructed. */
	Reader ur;

	/** The opening XML sequence consumed while scanning, still to be handed out; null when none is pending. */
	String finalBuffer = null;
	/** Index of the next character of finalBuffer to hand out. */
	int finalBufferIndex = 0;

	/**
	 * Wraps the given reader and immediately consumes (and echoes to
	 * System.err) everything up to the start of the XML content.
	 *
	 * If no start of XML is found, or the underlying reader fails while
	 * scanning, a message is printed to System.err and subsequent reads are
	 * simply passed on to the underlying reader.
	 *
	 * @param ur the reader to filter; must not be null
	 * @throws NullPointerException if ur is null
	 */
	public RemoveContentBeforeRootElementXMLReader( Reader ur ) {

		if ( ur == null ) {
			throw new NullPointerException( "ur" );
		}
		this.ur = ur;

		finalBuffer = findXMLStart( ur );

		if ( finalBuffer == null ) {
			System.err.println( "RemoveContentBeforeRootElementXMLReader:\n" +
				"The XML being loaded was not valid: couldn't find start of XML input" );
		}

	}

	/**
	 * Reads from the given reader until one of the xmlIndicators is seen
	 * outside of a comment, echoing everything before it to System.err.
	 *
	 * @return the opening sequence that was matched (already consumed from
	 *         the reader), or null if the input ended or failed first
	 */
	private static String findXMLStart( Reader in ) {

		boolean inComment = false;
		StringBuffer buffer = null;

		while ( true ) {

			//read a character
			int c;
			try {
				c = in.read();
			} catch( IOException e ) {
				// The constructor cannot throw a checked exception, so a failing
				// reader is treated like the end of the input rather than retried.
				System.err.println( "Exception while reading underlying Reader in RemoveContentBeforeRootElementXMLReader: " + e );
				return null;
			}

			//give up if we have reached the end of the input
			if ( c == -1 ) {
				return null;
			}

			//we start buffering when we come across the first <
			//regardless of whether it turns out to be a relevant < or not
			if ( buffer == null ) {
				if ( (char)c != '<' ) {
					//not buffering yet: just display the character
					System.err.print( (char)c );
					continue;
				}
				buffer = new StringBuffer();
			}

			buffer.append( (char)c );
			String text = buffer.toString();

			//inside a comment only the comment terminator is of interest
			if ( inComment ) {
				if ( commentStop.matcher(text).find() ) {
					inComment = false;
					System.err.print( text );
					buffer = new StringBuffer();
				}
				continue;
			}

			//a comment is opening: flush what we have and skip its content
			if ( commentStart.matcher(text).find() ) {
				inComment = true;
				System.err.print( text );
				buffer = new StringBuffer();
				continue;
			}

			//check each indicator to see if found
			for ( int i = 0; i < xmlIndicators.length; i++ ) {
				java.util.regex.Matcher m = xmlIndicators[i].matcher( text );
				if ( m.find() ) {
					//flush the characters preceding the match to the console
					System.err.print( text.substring( 0, m.start() ) );
					return text.substring( m.start() );
				}
			}
		}
	}

	/**
	 * Reads characters into a portion of an array, first from the pending
	 * opening sequence and, once that is exhausted, from the underlying reader.
	 * May return fewer than len characters.
	 *
	 * @param cbuf destination array
	 * @param off index in cbuf at which to start storing
	 * @param len maximum number of characters to read
	 * @return the number of characters stored, or -1 at the end of the input
	 * @throws IndexOutOfBoundsException if off or len is negative, or
	 *         off + len exceeds the length of cbuf
	 * @throws IOException if the underlying reader fails
	 */
	public int read( char[] cbuf, int off, int len ) throws IOException {

		if ( off < 0 || len < 0 || len > cbuf.length - off ) {
			throw new IndexOutOfBoundsException(
				"off=" + off + ", len=" + len + ", cbuf.length=" + cbuf.length );
		}
		if ( len == 0 ) {
			return 0;
		}

		//hand out the buffered opening XML sequence first
		if ( finalBuffer != null ) {
			int copied = Math.min( finalBuffer.length() - finalBufferIndex, len );
			finalBuffer.getChars( finalBufferIndex, finalBufferIndex + copied, cbuf, off );
			finalBufferIndex += copied;
			if ( finalBufferIndex >= finalBuffer.length() ) {
				finalBuffer = null;
			}
			if ( copied > 0 ) {
				// return what is already available instead of risking a block
				// (or a failure that would lose these characters) underneath
				return copied;
			}
		}

		return ur.read( cbuf, off, len );

	}

	/**
	 * Reads a single character: the next character of the pending opening
	 * sequence if there is one, otherwise the next from the underlying reader.
	 *
	 * @return the character read, or -1 at the end of the input
	 * @throws IOException if the underlying reader fails
	 */
	public int read() throws IOException {

		//flush the buffer containing the opening XML sequence
		if ( finalBuffer != null && finalBuffer.length() > finalBufferIndex ) {
			char c = finalBuffer.charAt(finalBufferIndex++);
			if ( finalBufferIndex == finalBuffer.length() ) {
				finalBuffer = null;
			}
			return c;
		}

		return ur.read();
	}

	/**
	 * Closes the underlying reader.
	 *
	 * @throws IOException if the underlying reader fails to close
	 */
	public void close() throws IOException {
		ur.close();
	}

	/**
	 * Manual test: filters the file text.xml in the current directory and
	 * prints the retained content to System.out.
	 */
	public static void main ( String[] args ) {

		//init
		System.out.println( "------------\nWill now initialise the test reader\n------------" );
		RemoveContentBeforeRootElementXMLReader parser = null;
		try {
			parser = new RemoveContentBeforeRootElementXMLReader(
				new FileReader( new File("text.xml") ) );
		} catch ( java.io.FileNotFoundException fnfe ) {
			System.err.println( "Please create text.xml to test this class" );
			System.exit(-1);
		}

		//read the rest of the input
		System.out.println( "------------\nWill now read the rest of the input\n------------" );
		try {
			int c = 0;
			while ( ( c = parser.read() ) != -1 ) {
				System.out.print( (char)c );
			}
		} catch ( IOException e ) {
			System.err.println("Exception: " + e);
		} finally {
			try {
				parser.close();
			} catch ( IOException e ) {
				System.err.println("Exception: " + e);
			}
		}

	}


}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: