Index: /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java
===================================================================
--- /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java (revision 18353)
+++ /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/RemoveContentBeforeRootElementXMLReader.java (revision 18353)
@@ -0,0 +1,92 @@
+package org.greenstone.gatherer.util;
+
+import java.io.Reader;
+import java.io.IOException;
+
+public class RemoveContentBeforeRootElementXMLReader extends Reader {
+
+ static final byte[] xmlIndicator = "";
static final public String FURTHER_DIALOG_INDICATOR = "...";
+ static final public String FEDORA_MODE= "-fedora";
+ static final public String FEDORA_HOME = "-fedora_home";
+ static final public String FEDORA_VERSION = "-fedora_version";
+ static final public String FEDORA_HOSTNAME = "-fedora_hostname";
+ static final public String FEDORA_PORT = "-fedora_port";
+ static final public String FEDORA_USERNAME = "-fedora_username";
+ static final public String FEDORA_PASSWORD = "-fedora_password";
+ static final public String FEDORA_PROTOCOL = "-fedora_protocol";
static final public String GLI_ATTRIBUTE = "gli";
static final public String GLISERVER_URL_ARGUMENT = "-gliserver_url";
@@ -144,5 +153,5 @@
static final public String INT_STR = "int";
static final public String IMPORT_STR = "import";
- static final public String[] KEEP_PLUG = { "GAPlug", "METSPlug" };
+ static final public String[] KEEP_PLUG = { "GreenstoneXMLPlugin", "GreenstoneMETSPlugin" };
static final public String LANGUAGE_ARGUMENT = "l=";
static final public String LANGUAGE_ATTRIBUTE = "language";
@@ -181,5 +190,5 @@
static final public String METADATA_TYPE_STR = "metadata";
static final public String METADATA_XML = "metadata.xml";
- static final public String METADATAXMLPLUG_STR = "MetadataXMLPlug";
+ static final public String METADATAXMLPLUG_STR = "MetadataXMLPlugin";
static final public String METADATUM_TYPE_STR = "metadatum";
static final public String MGPP_ATTRIBUTE = "mgpp_enabled";
@@ -207,5 +216,5 @@
static final public String PREDEFINED_METADATA_ATTRIBUTE = "predefined";
static final public String RBRACKET_CHARACTER = "]";
- static final public String RECPLUG_STR = "RecPlug";
+ static final public String RECPLUG_STR = "DirectoryPlugin";
static final public String REGEXP_STR = "regexp";
static final public String REPLACELISTREF_STR = "replaceListRef";
@@ -216,4 +225,5 @@
static final public String SECTION_ELEMENT = "Section";
static final public String SECTION_STR = "section";
+ static final public String SEPARATE_CJK_OPTION_STR = "separate_cjk";
static final public String SEPARATOR_ATTRIBUTE = "separator";
static final public String SEPARATOR_CHARACTER = "/";
@@ -246,5 +256,5 @@
static final public String TYPE_ATTRIBUTE = "type";
static final public String UNKNOWN_ELEMENT = "Unknown";
- static final public String UNKNOWNPLUG_STR = "UnknownPlug";
+ static final public String UNKNOWNPLUG_STR = "UnknownPlugin";
static final public String USE_METADATA_FILES_ARGUMENT = "use_metadata_files";
static final public String USE_REMOTE_GREENSTONE_ARGUMENT = "-use_remote_greenstone";
Index: /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/XMLTools.java
===================================================================
--- /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/XMLTools.java (revision 18352)
+++ /gli/branches/rtl-gli/src/org/greenstone/gatherer/util/XMLTools.java (revision 18353)
@@ -261,5 +261,5 @@
static final public String NOTWELLFORMED= "not well-formed";
static final private String HEADER = "";
- static final private String FOOTER = "";
+ static final private String FOOTER = "";
public static String parse (String xml_str) {
@@ -332,5 +332,5 @@
SAXParser parser = factory.newSAXParser ();
FileReader r = new FileReader(xml_file);
- InputSource iSource = new InputSource(r);
+ InputSource iSource = new InputSource(r);
XMLReader reader = parser.getXMLReader ();
reader.setContentHandler(new DefaultHandler());
@@ -372,4 +372,5 @@
} // getLocationString(SAXParseException):String
+
/** Parse an XML document from a given file path */
static public Document parseXMLFile (String xml_file_path, boolean use_class_loader) {
@@ -409,6 +410,5 @@
try {
InputStreamReader isr = new InputStreamReader (xml_input_stream, "UTF-8");
- Reader xml_reader = new BufferedReader (isr);
- document = parseXML (xml_reader);
+ document = parseXML(isr);
isr.close ();
xml_input_stream.close ();
@@ -425,19 +425,71 @@
static public Document parseXML (Reader xml_reader) {
Document document = null;
-
- try {
- InputSource isc = new InputSource (xml_reader);
- DOMParser parser = new DOMParser ();
- parser.setFeature ("http://xml.org/sax/features/validation", false);
- parser.setFeature ("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
- // May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
- parser.setFeature ("http://apache.org/xml/features/dom/defer-node-expansion", true);
- parser.setFeature ("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
- parser.parse (isc);
- document = parser.getDocument ();
- }
- catch (SAXException exception) {
- System.err.println ("SAX exception: " + exception.getMessage ());
- DebugStream.printStackTrace (exception);
+
+ // If debugging, the following will store the XML contents to be parsed,
+ // which can then be inspected upon encountering a SAXException (need to run GLI with -debug on)
+ String xmlContents = "";
+
+ try {
+ Reader reader = null;
+
+ // (1) By default, GLI will remove any content preceding (and thereby invalidating)
+ // the XML and present those lines separately to the user
+ if(!DebugStream.isDebuggingEnabled()) {
+ try {
+ reader = new BufferedReader( new RemoveContentBeforeRootElementXMLReader(xml_reader) );
+ } catch ( Exception e ) {
+ System.err.println( "Exception while wrapping the reader in parseXML(Reader)" );
+ e.printStackTrace();
+ }
+ }
+
+ // (2) If we are running GLI in debug mode:
+ // In case parsing exceptions are thrown (SAX Exceptions), we want to get some
+ // idea of where things went wrong. This will print the "XML" contents to either
+ // system.out (if debugging is off) or to the DebugStream otherwise.
+ // We need to read the XML twice to know the line where things went wrong, so
+ // do the additional reading only if we're debugging
+ else {
+ StringBuffer buf = new StringBuffer();
+ char[] buffer = new char[500];
+ int numCharsRead = xml_reader.read(buffer, 0, buffer.length);
+ while(numCharsRead != -1) {
+ buf.append(buffer, 0, numCharsRead);
+ numCharsRead = xml_reader.read(buffer, 0, buffer.length);
+ }
+ xmlContents = buf.toString();
+ xml_reader.close(); // closing the old Reader
+ xml_reader = null;
+ buffer = null;
+ buf = null;
+ // we need a Reader to parse the same contents as the Reader that was just closed
+ reader = new BufferedReader(new StringReader(xmlContents));
+ }
+
+ // (3) The actual XML parsing
+ InputSource isc = new InputSource (reader);
+ DOMParser parser = new DOMParser ();
+ parser.setFeature ("http://xml.org/sax/features/validation", false);
+ parser.setFeature ("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+ // May or may not be ignored, the documentation for Xerces is contradictory. If it works then parsing -should- be faster.
+ parser.setFeature ("http://apache.org/xml/features/dom/defer-node-expansion", true);
+ parser.setFeature ("http://apache.org/xml/features/dom/include-ignorable-whitespace", false);
+ parser.parse (isc);
+ document = parser.getDocument ();
+
+ } catch(SAXParseException e) {
+ showXMLParseFailureLine(e, xmlContents);
+ } catch (SAXException exception) {
+ System.err.println ("SAX exception: " + exception.getMessage ());
+ if(DebugStream.isDebuggingEnabled()) {
+ DebugStream.println("Encountered a SAX exception when parsing the following:\n*********START\n"
+ + xmlContents + "\n************END\n");
+ // Exit to let the user view the erroneous line/xml before it goes past the screen buffer?
+ DebugStream.println("Debug mode: Exiting the program as there was trouble parsing the XML...");
+ System.exit(-1);
+ }
+ // else, not running in debug mode, so don't exit after exception
+ System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents that could not be parsed.");
+ DebugStream.printStackTrace (exception);
}
catch (Exception exception) {
@@ -447,5 +499,51 @@
return document;
}
-
+
+ /** Displays the line (string) where the SAXParseException occurred, given a String of the
+  * entire xml that was being parsed and the SAXParseException object that was caught.
+  * The messages are printed to DebugStream, so run GLI/FLI with -debug to view this output.
+  * When debugging is enabled this method terminates the JVM via System.exit(-1) after
+  * printing, so that the offending line/xml stays visible to the user.
+  * @param e is the SAXParseException object that was thrown upon parsing the xmlContents.
+  * @param xmlContents is the entire xml that was being parsed when the exception occurred
+  */
+ public static void showXMLParseFailureLine(SAXParseException e, String xmlContents) {
+
+     // There should be no characters at all that precede the XML declaration.
+     // The first check is for starting spaces:
+     if(xmlContents.startsWith("\n") || xmlContents.startsWith(" ") || xmlContents.startsWith("\t")) {
+         DebugStream.println("ERROR: illegal start of XML. Space/tab/newline should not preceed xml declaration.\n");
+         DebugStream.println("xmlContents (length is " + xmlContents.length() + "):\n" + xmlContents);
+         return; // nothing more to do, first error identified
+     }
+
+     // the actual line (String literal) where parsing failed and the SAXParseException occurred.
+     String line = "";
+     int linenumber = e.getLineNumber();
+     DebugStream.println("\n****SAXParseException on LINE NUMBER: " + linenumber);
+     if(DebugStream.isDebuggingEnabled()) {
+         if(linenumber != -1) {
+             // find the line in xmlContents string (xmlContents is only set if GLI is run with debugging turned on)
+             int start = 0;
+             // Walk forward one line per iteration. Each search must begin at the start of the
+             // current line; searching from index 0 every time would keep returning the first
+             // newline and make substring(start, end) fail for any line after the first.
+             for(int i = 1; i <= linenumber && start <= xmlContents.length(); i++) {
+                 int end = xmlContents.indexOf("\n", start);
+                 if(end == -1) {
+                     // last line of the text has no terminating newline
+                     end = xmlContents.length();
+                 }
+                 line = xmlContents.substring(start, end);
+                 start = end + 1;
+             }
+             DebugStream.println("The parsing error occurred on this line:\n***********START\n" + line + "\n***********END");
+             DebugStream.println("SAXParseException message: " + e.getMessage() + "\n");
+         } else { // no particular line number, print out all the xml so debugger can inspect it
+             DebugStream.println("Encountered a SAX exception when parsing the following:\n*********START\n"
+                 + xmlContents + "\n************END\n");
+         }
+         // Exit to let the user view the erroneous line/xml before it goes past the screen buffer?
+         DebugStream.println("\nDebug mode: Exiting the program as there was trouble parsing the XML...");
+         System.exit(-1);
+     } else { // not running in debug mode
+         System.out.println("***Turn debugging on (run GLI with -debug) to view the XML contents/line that could not be parsed.");
+     }
+ }
+
static public StringBuffer readXMLStream (InputStream input_stream) {
@@ -540,6 +638,7 @@
f.setLineWidth (0); // Why isn't this working!
f.setPreserveSpace (false);
-
- f.setNonEscapingElements (nonEscapingTagNames);
+ if (nonEscapingTagNames != null) {
+ f.setNonEscapingElements (nonEscapingTagNames);
+ }
// Create the necessary writer stream for serialization.
OutputStreamWriter osw = new OutputStreamWriter (os, "UTF-8");
@@ -560,28 +659,7 @@
/** Write an XML document to a given file */
static public void writeXMLFile (File xml_file, Document document) {
- try {
- OutputStream os = new FileOutputStream (xml_file);
- // Create an output format for our document.
- OutputFormat f = new OutputFormat (document);
- f.setEncoding ("UTF-8");
- f.setIndenting (true);
- f.setLineWidth (0); // Why isn't this working!
- f.setPreserveSpace (false);
- // Create the necessary writer stream for serialization.
- OutputStreamWriter osw = new OutputStreamWriter (os, "UTF-8");
- Writer w = new BufferedWriter (osw);
- // Generate a new serializer from the above.
- XMLSerializer s = new XMLSerializer (w, f);
- s.asDOMSerializer ();
- // Finally serialize the document to file.
- s.serialize (document);
- // And close.
- os.close ();
- }
- catch (Exception exception) {
- DebugStream.printStackTrace (exception);
- }
- }
-
+ writeXMLFile(xml_file, document, null);
+ }
+
public static void printXMLNode (Node e) {
printXMLNode (e, 0) ;
@@ -686,4 +764,10 @@
}
+
+ /** Serialises the given node to a String by delegating to
+  * xmlNodeToStringWithoutNewline(StringBuffer, Node, int) with a depth of -1.
+  * NOTE(review): the negative depth presumably suppresses indentation in the
+  * delegate - confirm against that method.
+  * @param e the node to convert
+  * @return the text accumulated by the delegate
+  */
+ public static String xmlNodeToStringWithoutIndenting (Node e) {
+     final int noIndentDepth = -1;
+     StringBuffer text = new StringBuffer ("");
+     xmlNodeToStringWithoutNewline(text, e, noIndentDepth);
+     return text.toString();
+ }
public static String xmlNodeToStringWithoutNewline (Node e){
StringBuffer sb = new StringBuffer ("");
@@ -696,5 +780,5 @@
for (int i=0 ; i= 0) {
xmlNodeToStringWithoutNewline (sb,children.item (i), depth + 1);
+ } else {
+ xmlNodeToStringWithoutNewline (sb,children.item (i), depth);
+ }
}
@@ -735,7 +823,4 @@
sb.append ("" + e.getNodeName () + ">");
}
-
-
- }
-
+ }
}