/** *######################################################################### * * A component of the Gatherer application, part of the Greenstone digital * library suite from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * Author: John Thompson, Greenstone Digital Library, University of Waikato * * Copyright (C) 1999 New Zealand Digital Library Project * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *######################################################################## */ package org.greenstone.gatherer.util; import java.util.*; /** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters) * @author John Thompson, Greenstone Digital Library, University of Waikato * @version 2.3d */ public class Codec { static final public String DECODE_PATH = "DECODE_PATH"; static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS"; static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE"; static final public String DOM_TO_TEXT = "DOM_TO_TEXT"; static final public String ENCODE_PATH = "ENCODE_PATH"; static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS"; static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED"; static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM"; static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT"; static final public String TEXT_TO_DOM = "TEXT_TO_DOM"; static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE"; static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP"; static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX"; static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS"; static final private int MAX_CACHE_SIZE = 100; static private HashMap TRANSFORMS; static private HashMap3D CACHE; /** Static function called to construct TRANSFORMS mappings */ static { TRANSFORMS = new HashMap(); String[] decode_path = { "\\|", "\\\\", "|", "\\|" }; TRANSFORMS.put(DECODE_PATH, decode_path); decode_path = null; // Transform text into text, but without [ and ] String[] decode_square_brackets = { "[", "\\[", "]", "\\]" }; TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets); decode_square_brackets = null; // Translate DOM encoded text into Greenstone encoding String[] dom_to_greenstone = { "'", "\\\\\'", ">", ">", "<", "<", """, "\\\\\"", "&", "&" }; // removed "\n", "\\\\n", - config files are allowed new lines // added "\\|", "\\\\" TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone); dom_to_greenstone = null; // Transform DOM encoded text into plain text String[] dom_to_text = { "&#091;", "\\[", "&#093;", "\\]", "'", "\'", ">", ">", "<", "<", """, "\"", "&", "&" }; TRANSFORMS.put(DOM_TO_TEXT, dom_to_text); dom_to_text = null; // Transform text into a regular expression that will match it String[] text_to_regexp = { "\\\\", "\\\\\\\\", "\\(", "\\\\(", "\\)", "\\\\)", "\\[", "\\\\[", "\\]", "\\\\]", "\\{", "\\\\{", "\\}", "\\\\}", "\\.", "\\\\." }; TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp); text_to_regexp = null; String[] encode_path = { "\\|", "|", "\\\\", "\\|" }; TRANSFORMS.put(ENCODE_PATH, encode_path); encode_path = null; // Transform text into text, but without [ and ] String[] encode_square_brackets = { "\\[", "[", "\\]", "]" }; TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets); encode_square_brackets = null; // Transform Greenstone encoded text to DOM encoding String[] greenstone_to_dom = { "&", "&", "<", "<", ">", ">", "\\\\\"", """, "\\\\\'", "'", "\"", """, "\'", "'" }; // removed"\\\\n", "\n", added "\\\\", "\\|" TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom); greenstone_to_dom = null; // Transform Greenstone encoded text to plain text String[] greenstone_to_text = { "\\\\\"", "\"", "\\\\\'", "\'", """, "\"", "'", "\'", "[", "\\[", "]", "\\]" }; // removed "\\\\n", "\n", "\\|", "\\\\" TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text); greenstone_to_text = null; // Transform plain html text into something that can be placed in a DOM String[] text_to_dom = { "&", "&", "<", "<", ">", ">", "\"", """, "\'", "'" }; TRANSFORMS.put(TEXT_TO_DOM, text_to_dom); text_to_dom = null; // Unescape html (or xml) text String[] escapedhtml_to_unescaped = { "&", "&", "<", "<", ">", ">", """, "\""//, //"'", "\'" }; TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, escapedhtml_to_unescaped); escapedhtml_to_unescaped = null; // Transform plain html text into greenstone encoding String[] text_to_greenstone = { "\\[", "[", "\\]", "]", "\"", """, "\n", "\\\\n" }; // "\'", "'", // removed "\\\\", "\\|", TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone); text_to_greenstone = null; // Transform plain html text into something that can be placed in a shell command String[] text_to_shell_unix = { "\"", "\\\\\"", "\'", "\\\\\'", "\n", "\\\\n" }; TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix); text_to_shell_unix = null; // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes String[] text_to_shell_windows = { "\"", "\\\\\\\\\\\\\"", "\'", "\\\\\'", "\n", "\\\\n" }; TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows); text_to_shell_windows = null; CACHE = new HashMap3D(); } static public String transform(String raw, String transform) { if(raw == null) { return raw; } // System.err.println("Transforming by "+transform+":\n" + raw); String processed = (String) CACHE.get(transform, raw); if(processed == null) { processed = raw; String[] transforms = (String[]) TRANSFORMS.get(transform); if(transforms != null) { for(int i = 0; i < transforms.length; i = i + 2) { String target = transforms[i]; String result = transforms[i+1]; processed = processed.replaceAll(target, result); } } //DebugStream.println("\n*** Transform: " + transform + " ***"); //DebugStream.println("*** Raw : '" + raw + "'"); //DebugStream.println("*** Processed: '" + processed + "'"); // If cache is at maximum size, empty it and start again if(CACHE.size() == MAX_CACHE_SIZE) { CACHE.clear(); } CACHE.put(transform, raw, processed); } return processed; } /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */ static final private char AND_CHAR = '&'; static final private char ESCAPE_CHAR = '\\'; static final private char HASH_CHAR = '#'; static final private char LOWER_U_CHAR = 'u'; static final private char UPPER_U_CHAR = 'U'; static final private char SEMICOLON_CHAR = ';'; static public String transformUnicode(String raw) { StringBuffer processed = new StringBuffer(); int index = 0; int raw_length = raw.length(); while(index < raw_length) { char c0 = raw.charAt(index); switch(c0) { case AND_CHAR: if(index + 1 < raw_length) { // First the HTML ç type char c1 = raw.charAt(index + 1); if(c1 == HASH_CHAR) { StringBuffer number_str = new StringBuffer(); char c2; int offset = 2; while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) { number_str.append(c2); offset++; } // We've either run out of characters or have parsed a number if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) { int number = Integer.parseInt(number_str.toString()); processed.append((char)number); index = index + offset; number_str = null; break; } number_str = null; } } processed.append(c0); break; case ESCAPE_CHAR: // Now the \u00e7 type if(index + 1 < raw_length) { char c3 = raw.charAt(index + 1); if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) { // We read four digits String hex_str = raw.substring(index + 2, index + 6); int number = Integer.parseInt(hex_str, 16); hex_str = null; processed.append((char)number); index = index + 5; break; } } processed.append(c0); break; default: processed.append(c0); } index++; } return processed.toString(); } static public void main(String[] args) { if(args.length < 2) { String processed; String raw; String transform; System.err.println("Running Test Suite"); transform = "DOM_TO_GREENSTONE"; System.err.println("Test " + transform); raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>"; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "DOM_TO_TEXT"; System.err.println("Test " + transform); raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>"; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "GREENSTONE_TO_DOM"; System.err.println("Test " + transform); raw = "A <\\nand a "; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "GREENSTONE_TO_TEXT"; System.err.println("Test " + transform); raw = "These \\[ \\] should be escaped, and so should \\\\ that. These " ' \\n are encoded."; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "TEXT_TO_DOM"; System.err.println("Test " + transform); raw = "A <\nand a URL"; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "TEXT_TO_GREENSTONE"; System.err.println("Test " + transform); raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded."; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); transform = "TEXT_TO_SHELL"; System.err.println("Test " + transform); if(Utility.isWindows()) { System.err.println("[Windows Version]"); transform = "TEXT_TO_SHELL_WINDOWS"; } else { System.err.println("[Unix Version]"); transform = "TEXT_TO_SHELL_UNIX"; } raw = "A <\nand a URL"; System.err.println("Raw: '" + raw + "'"); processed = transform(raw, transform); System.err.println("Processed: '" + processed + "'"); System.err.println("***** UNICODE TEST *****"); System.err.println("\\u0030 => " + transformUnicode("\\u0030")); System.err.println("\\u0041 => " + transformUnicode("\\u0041")); System.err.println("\\u007a => " + transformUnicode("\\u007a")); System.err.println("\\u00e7 => " + transformUnicode("\\u00e7")); System.err.println("0 => " + transformUnicode("0")); System.err.println("A => " + transformUnicode("A")); System.err.println("z => " + transformUnicode("z")); System.err.println("ç => " + transformUnicode("ç")); } else { System.err.println("Raw: '" + args[0] + "'"); System.err.println("Transform: " + args[1]); String processed = transform(args[0], args[1]); System.err.println("Processed: '" + processed + "'"); } } }