/**
*#########################################################################
*
* A component of the Gatherer application, part of the Greenstone digital
* library suite from the New Zealand Digital Library Project at the
* University of Waikato, New Zealand.
*
* Author: John Thompson, Greenstone Digital Library, University of Waikato
*
* Copyright (C) 1999 New Zealand Digital Library Project
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*########################################################################
*/
package org.greenstone.gatherer.util;
import java.util.*;
/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
* @author John Thompson, Greenstone Digital Library, University of Waikato
* @version 2.3d
*/
public class Codec {
static final public String DECODE_PATH = "DECODE_PATH";
static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
static final public String ENCODE_PATH = "ENCODE_PATH";
static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
static final private int MAX_CACHE_SIZE = 100;
static private HashMap TRANSFORMS;
static private HashMap3D CACHE;
/** Static function called to construct TRANSFORMS mappings */
static {
TRANSFORMS = new HashMap();
String[] decode_path = {
"\\|", "\\\\",
"|", "\\|"
};
TRANSFORMS.put(DECODE_PATH, decode_path);
decode_path = null;
// Transform text into text, but without [ and ]
String[] decode_square_brackets = {
"[", "\\[",
"]", "\\]"
};
TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
decode_square_brackets = null;
// Translate DOM encoded text into Greenstone encoding
String[] dom_to_greenstone = {
"'", "\\\\\'",
">", ">",
"<", "<",
""", "\\\\\"",
"&", "&"
};
// removed "\n", "\\\\n", - config files are allowed new lines
// added "\\|", "\\\\"
TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
dom_to_greenstone = null;
// Transform DOM encoded text into plain text
String[] dom_to_text = {
"[", "\\[",
"]", "\\]",
"'", "\'",
">", ">",
"<", "<",
""", "\"",
"&", "&"
};
TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
dom_to_text = null;
// Transform text into a regular expression that will match it
String[] text_to_regexp = {
"\\\\", "\\\\\\\\",
"\\(", "\\\\(",
"\\)", "\\\\)",
"\\[", "\\\\[",
"\\]", "\\\\]",
"\\{", "\\\\{",
"\\}", "\\\\}",
"\\.", "\\\\."
};
TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
text_to_regexp = null;
String[] encode_path = {
"\\|", "|",
"\\\\", "\\|"
};
TRANSFORMS.put(ENCODE_PATH, encode_path);
encode_path = null;
// Transform text into text, but without [ and ]
String[] encode_square_brackets = {
"\\[", "[",
"\\]", "]"
};
TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
encode_square_brackets = null;
// Transform Greenstone encoded text to DOM encoding
String[] greenstone_to_dom = {
"&", "&",
"<", "<",
">", ">",
"\\\\\"", """,
"\\\\\'", "'",
"\"", """,
"\'", "'"
};
// removed"\\\\n", "\n", added "\\\\", "\\|"
TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
greenstone_to_dom = null;
// Transform Greenstone encoded text to plain text
String[] greenstone_to_text = {
"\\\\\"", "\"",
"\\\\\'", "\'",
""", "\"",
"'", "\'",
"[", "\\[",
"]", "\\]"
};
// removed "\\\\n", "\n", "\\|", "\\\\"
TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
greenstone_to_text = null;
// Transform plain html text into something that can be placed in a DOM
String[] text_to_dom = {
"&", "&",
"<", "<",
">", ">",
"\"", """,
"\'", "'"
};
TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
text_to_dom = null;
// Unescape html (or xml) text
String[] escapedhtml_to_unescaped = {
"&", "&",
"<", "<",
">", ">",
""", "\""//,
//"'", "\'"
};
TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, escapedhtml_to_unescaped);
escapedhtml_to_unescaped = null;
// Transform plain html text into greenstone encoding
String[] text_to_greenstone = {
"\\[", "[",
"\\]", "]",
"\"", """,
"\n", "\\\\n"
};
// "\'", "'",
// removed "\\\\", "\\|",
TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
text_to_greenstone = null;
// Transform plain html text into something that can be placed in a shell command
String[] text_to_shell_unix = {
"\"", "\\\\\"",
"\'", "\\\\\'",
"\n", "\\\\n"
};
TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
text_to_shell_unix = null;
// Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
String[] text_to_shell_windows = {
"\"", "\\\\\\\\\\\\\"",
"\'", "\\\\\'",
"\n", "\\\\n"
};
TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
text_to_shell_windows = null;
CACHE = new HashMap3D();
}
static public String transform(String raw, String transform) {
if(raw == null) {
return raw;
}
// System.err.println("Transforming by "+transform+":\n" + raw);
String processed = (String) CACHE.get(transform, raw);
if(processed == null) {
processed = raw;
String[] transforms = (String[]) TRANSFORMS.get(transform);
if(transforms != null) {
for(int i = 0; i < transforms.length; i = i + 2) {
String target = transforms[i];
String result = transforms[i+1];
processed = processed.replaceAll(target, result);
}
}
//DebugStream.println("\n*** Transform: " + transform + " ***");
//DebugStream.println("*** Raw : '" + raw + "'");
//DebugStream.println("*** Processed: '" + processed + "'");
// If cache is at maximum size, empty it and start again
if(CACHE.size() == MAX_CACHE_SIZE) {
CACHE.clear();
}
CACHE.put(transform, raw, processed);
}
return processed;
}
/** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
static final private char AND_CHAR = '&';
static final private char ESCAPE_CHAR = '\\';
static final private char HASH_CHAR = '#';
static final private char LOWER_U_CHAR = 'u';
static final private char UPPER_U_CHAR = 'U';
static final private char SEMICOLON_CHAR = ';';
static public String transformUnicode(String raw) {
StringBuffer processed = new StringBuffer();
int index = 0;
int raw_length = raw.length();
while(index < raw_length) {
char c0 = raw.charAt(index);
switch(c0) {
case AND_CHAR:
if(index + 1 < raw_length) {
// First the HTML ç type
char c1 = raw.charAt(index + 1);
if(c1 == HASH_CHAR) {
StringBuffer number_str = new StringBuffer();
char c2;
int offset = 2;
while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
number_str.append(c2);
offset++;
}
// We've either run out of characters or have parsed a number
if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
int number = Integer.parseInt(number_str.toString());
processed.append((char)number);
index = index + offset;
number_str = null;
break;
}
number_str = null;
}
}
processed.append(c0);
break;
case ESCAPE_CHAR:
// Now the \u00e7 type
if(index + 1 < raw_length) {
char c3 = raw.charAt(index + 1);
if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
// We read four digits
String hex_str = raw.substring(index + 2, index + 6);
int number = Integer.parseInt(hex_str, 16);
hex_str = null;
processed.append((char)number);
index = index + 5;
break;
}
}
processed.append(c0);
break;
default:
processed.append(c0);
}
index++;
}
return processed.toString();
}
static public void main(String[] args) {
if(args.length < 2) {
String processed;
String raw;
String transform;
System.err.println("Running Test Suite");
transform = "DOM_TO_GREENSTONE";
System.err.println("Test " + transform);
raw = "A <\nand a <a href="here.html"><font size='2'>URL</font></a>";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "DOM_TO_TEXT";
System.err.println("Test " + transform);
raw = "A <\nand a <a href="here.html"><font size='2'>URL</font></a>";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "GREENSTONE_TO_DOM";
System.err.println("Test " + transform);
raw = "A <\\nand a ";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "GREENSTONE_TO_TEXT";
System.err.println("Test " + transform);
raw = "These \\[ \\] should be escaped, and so should \\\\ that. These " ' \\n are encoded.";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "TEXT_TO_DOM";
System.err.println("Test " + transform);
raw = "A <\nand a URL";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "TEXT_TO_GREENSTONE";
System.err.println("Test " + transform);
raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
transform = "TEXT_TO_SHELL";
System.err.println("Test " + transform);
if(Utility.isWindows()) {
System.err.println("[Windows Version]");
transform = "TEXT_TO_SHELL_WINDOWS";
}
else {
System.err.println("[Unix Version]");
transform = "TEXT_TO_SHELL_UNIX";
}
raw = "A <\nand a URL";
System.err.println("Raw: '" + raw + "'");
processed = transform(raw, transform);
System.err.println("Processed: '" + processed + "'");
System.err.println("***** UNICODE TEST *****");
System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
System.err.println("\\u007a => " + transformUnicode("\\u007a"));
System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
System.err.println("0 => " + transformUnicode("0"));
System.err.println("A => " + transformUnicode("A"));
System.err.println("z => " + transformUnicode("z"));
System.err.println("ç => " + transformUnicode("ç"));
}
else {
System.err.println("Raw: '" + args[0] + "'");
System.err.println("Transform: " + args[1]);
String processed = transform(args[0], args[1]);
System.err.println("Processed: '" + processed + "'");
}
}
}