/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;

import java.util.*;

| 31 | /** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
|
---|
| 32 | * @author John Thompson, Greenstone Digital Library, University of Waikato
|
---|
| 33 | * @version 2.3d
|
---|
| 34 | */
|
---|
| 35 | public class Codec {
|
---|
| 36 |
|
---|
| 37 | static final public String DECODE_PATH = "DECODE_PATH";
|
---|
| 38 | static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
|
---|
| 39 | static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
|
---|
| 40 | static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
|
---|
| 41 | static final public String ENCODE_PATH = "ENCODE_PATH";
|
---|
| 42 | static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
|
---|
| 43 | static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
|
---|
| 44 | static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
|
---|
| 45 | static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
|
---|
| 46 | static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
|
---|
| 47 | static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
|
---|
| 48 | static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
|
---|
| 49 | static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
|
---|
| 50 | static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
|
---|
| 51 |
|
---|
| 52 | static final private int MAX_CACHE_SIZE = 100;
|
---|
| 53 |
|
---|
| 54 | static private HashMap TRANSFORMS;
|
---|
| 55 | static private HashMap3D CACHE;
|
---|
| 56 |
|
---|
| 57 | /** Static function called to construct TRANSFORMS mappings */
|
---|
| 58 | static {
|
---|
| 59 | TRANSFORMS = new HashMap();
|
---|
| 60 |
|
---|
| 61 | String[] decode_path = {
|
---|
| 62 | "\\|", "\\\\",
|
---|
| 63 | "|", "\\|"
|
---|
| 64 | };
|
---|
| 65 | TRANSFORMS.put(DECODE_PATH, decode_path);
|
---|
| 66 | decode_path = null;
|
---|
| 67 |
|
---|
| 68 | // Transform text into text, but without [ and ]
|
---|
| 69 | String[] decode_square_brackets = {
|
---|
| 70 | "[", "\\[",
|
---|
| 71 | "]", "\\]"
|
---|
| 72 | };
|
---|
| 73 | TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
|
---|
| 74 | decode_square_brackets = null;
|
---|
| 75 |
|
---|
| 76 | // Translate DOM encoded text into Greenstone encoding
|
---|
| 77 | String[] dom_to_greenstone = {
|
---|
| 78 | "'", "\\\\\'",
|
---|
| 79 | ">", ">",
|
---|
| 80 | "<", "<",
|
---|
| 81 | """, "\\\\\"",
|
---|
| 82 | "&", "&"
|
---|
| 83 | };
|
---|
| 84 | // removed "\n", "\\\\n", - config files are allowed new lines
|
---|
| 85 | // added "\\|", "\\\\"
|
---|
| 86 |
|
---|
| 87 | TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
|
---|
| 88 | dom_to_greenstone = null;
|
---|
| 89 |
|
---|
| 90 | // Transform DOM encoded text into plain text
|
---|
| 91 | String[] dom_to_text = {
|
---|
| 92 | "&#091;", "\\[",
|
---|
| 93 | "&#093;", "\\]",
|
---|
| 94 | "'", "\'",
|
---|
| 95 | ">", ">",
|
---|
| 96 | "<", "<",
|
---|
| 97 | """, "\"",
|
---|
| 98 | "&", "&"
|
---|
| 99 | };
|
---|
| 100 | TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
|
---|
| 101 | dom_to_text = null;
|
---|
| 102 |
|
---|
| 103 | // Transform text into a regular expression that will match it
|
---|
| 104 | String[] text_to_regexp = {
|
---|
| 105 | "\\\\", "\\\\\\\\",
|
---|
| 106 | "\\(", "\\\\(",
|
---|
| 107 | "\\)", "\\\\)",
|
---|
| 108 | "\\[", "\\\\[",
|
---|
| 109 | "\\]", "\\\\]",
|
---|
| 110 | "\\{", "\\\\{",
|
---|
| 111 | "\\}", "\\\\}",
|
---|
| 112 | "\\.", "\\\\."
|
---|
| 113 | };
|
---|
| 114 | TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
|
---|
| 115 | text_to_regexp = null;
|
---|
| 116 |
|
---|
| 117 | String[] encode_path = {
|
---|
| 118 | "\\|", "|",
|
---|
| 119 | "\\\\", "\\|"
|
---|
| 120 | };
|
---|
| 121 | TRANSFORMS.put(ENCODE_PATH, encode_path);
|
---|
| 122 | encode_path = null;
|
---|
| 123 |
|
---|
| 124 | // Transform text into text, but without [ and ]
|
---|
| 125 | String[] encode_square_brackets = {
|
---|
| 126 | "\\[", "[",
|
---|
| 127 | "\\]", "]"
|
---|
| 128 | };
|
---|
| 129 | TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
|
---|
| 130 | encode_square_brackets = null;
|
---|
| 131 |
|
---|
| 132 | // Transform Greenstone encoded text to DOM encoding
|
---|
| 133 | String[] greenstone_to_dom = {
|
---|
| 134 | "&", "&",
|
---|
| 135 | "<", "<",
|
---|
| 136 | ">", ">",
|
---|
| 137 | "\\\\\"", """,
|
---|
| 138 | "\\\\\'", "'",
|
---|
| 139 | "\"", """,
|
---|
| 140 | "\'", "'"
|
---|
| 141 | };
|
---|
| 142 | // removed"\\\\n", "\n", added "\\\\", "\\|"
|
---|
| 143 |
|
---|
| 144 | TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
|
---|
| 145 | greenstone_to_dom = null;
|
---|
| 146 |
|
---|
| 147 | // Transform Greenstone encoded text to plain text
|
---|
| 148 | String[] greenstone_to_text = {
|
---|
| 149 | "\\\\\"", "\"",
|
---|
| 150 | "\\\\\'", "\'",
|
---|
| 151 | """, "\"",
|
---|
| 152 | "'", "\'",
|
---|
| 153 | "[", "\\[",
|
---|
| 154 | "]", "\\]"
|
---|
| 155 | };
|
---|
| 156 | // removed "\\\\n", "\n", "\\|", "\\\\"
|
---|
| 157 |
|
---|
| 158 | TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
|
---|
| 159 | greenstone_to_text = null;
|
---|
| 160 |
|
---|
| 161 | // Transform plain html text into something that can be placed in a DOM
|
---|
| 162 | String[] text_to_dom = {
|
---|
| 163 | "&", "&",
|
---|
| 164 | "<", "<",
|
---|
| 165 | ">", ">",
|
---|
| 166 | "\"", """,
|
---|
| 167 | "\'", "'"
|
---|
| 168 | };
|
---|
| 169 | TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
|
---|
| 170 | text_to_dom = null;
|
---|
| 171 |
|
---|
| 172 | // Unescape html (or xml) text
|
---|
| 173 | String[] escapedhtml_to_unescaped = {
|
---|
| 174 | "&", "&",
|
---|
| 175 | "<", "<",
|
---|
| 176 | ">", ">",
|
---|
| 177 | """, "\""//,
|
---|
| 178 | //"'", "\'"
|
---|
| 179 | };
|
---|
| 180 | TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, escapedhtml_to_unescaped);
|
---|
| 181 | escapedhtml_to_unescaped = null;
|
---|
| 182 |
|
---|
| 183 | // Transform plain html text into greenstone encoding
|
---|
| 184 | String[] text_to_greenstone = {
|
---|
| 185 |
|
---|
| 186 | "\\[", "[",
|
---|
| 187 | "\\]", "]",
|
---|
| 188 | "\"", """,
|
---|
| 189 | "\n", "\\\\n"
|
---|
| 190 | };
|
---|
| 191 | // "\'", "'",
|
---|
| 192 | // removed "\\\\", "\\|",
|
---|
| 193 | TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
|
---|
| 194 | text_to_greenstone = null;
|
---|
| 195 |
|
---|
| 196 | // Transform plain html text into something that can be placed in a shell command
|
---|
| 197 | String[] text_to_shell_unix = {
|
---|
| 198 | "\"", "\\\\\"",
|
---|
| 199 | "\'", "\\\\\'",
|
---|
| 200 | "\n", "\\\\n"
|
---|
| 201 | };
|
---|
| 202 | TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
|
---|
| 203 | text_to_shell_unix = null;
|
---|
| 204 |
|
---|
| 205 | // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
|
---|
| 206 | String[] text_to_shell_windows = {
|
---|
| 207 | "\"", "\\\\\\\\\\\\\"",
|
---|
| 208 | "\'", "\\\\\'",
|
---|
| 209 | "\n", "\\\\n"
|
---|
| 210 | };
|
---|
| 211 | TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
|
---|
| 212 | text_to_shell_windows = null;
|
---|
| 213 |
|
---|
| 214 | CACHE = new HashMap3D();
|
---|
| 215 | }
|
---|
| 216 |
|
---|
| 217 | static public String transform(String raw, String transform) {
|
---|
| 218 | if(raw == null) {
|
---|
| 219 | return raw;
|
---|
| 220 | }
|
---|
| 221 | // System.err.println("Transforming by "+transform+":\n" + raw);
|
---|
| 222 | String processed = (String) CACHE.get(transform, raw);
|
---|
| 223 | if(processed == null) {
|
---|
| 224 | processed = raw;
|
---|
| 225 | String[] transforms = (String[]) TRANSFORMS.get(transform);
|
---|
| 226 | if(transforms != null) {
|
---|
| 227 | for(int i = 0; i < transforms.length; i = i + 2) {
|
---|
| 228 | String target = transforms[i];
|
---|
| 229 | String result = transforms[i+1];
|
---|
| 230 | processed = processed.replaceAll(target, result);
|
---|
| 231 | }
|
---|
| 232 | }
|
---|
| 233 | //DebugStream.println("\n*** Transform: " + transform + " ***");
|
---|
| 234 | //DebugStream.println("*** Raw : '" + raw + "'");
|
---|
| 235 | //DebugStream.println("*** Processed: '" + processed + "'");
|
---|
| 236 | // If cache is at maximum size, empty it and start again
|
---|
| 237 | if(CACHE.size() == MAX_CACHE_SIZE) {
|
---|
| 238 | CACHE.clear();
|
---|
| 239 | }
|
---|
| 240 | CACHE.put(transform, raw, processed);
|
---|
| 241 | }
|
---|
| 242 | return processed;
|
---|
| 243 | }
|
---|
| 244 |
|
---|
| 245 | /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
|
---|
| 246 | static final private char AND_CHAR = '&';
|
---|
| 247 | static final private char ESCAPE_CHAR = '\\';
|
---|
| 248 | static final private char HASH_CHAR = '#';
|
---|
| 249 | static final private char LOWER_U_CHAR = 'u';
|
---|
| 250 | static final private char UPPER_U_CHAR = 'U';
|
---|
| 251 | static final private char SEMICOLON_CHAR = ';';
|
---|
| 252 |
|
---|
| 253 | static public String transformUnicode(String raw) {
|
---|
| 254 | StringBuffer processed = new StringBuffer();
|
---|
| 255 | int index = 0;
|
---|
| 256 | int raw_length = raw.length();
|
---|
| 257 | while(index < raw_length) {
|
---|
| 258 | char c0 = raw.charAt(index);
|
---|
| 259 | switch(c0) {
|
---|
| 260 | case AND_CHAR:
|
---|
| 261 | if(index + 1 < raw_length) {
|
---|
| 262 | // First the HTML ç type
|
---|
| 263 | char c1 = raw.charAt(index + 1);
|
---|
| 264 | if(c1 == HASH_CHAR) {
|
---|
| 265 | StringBuffer number_str = new StringBuffer();
|
---|
| 266 | char c2;
|
---|
| 267 | int offset = 2;
|
---|
| 268 | while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
|
---|
| 269 | number_str.append(c2);
|
---|
| 270 | offset++;
|
---|
| 271 | }
|
---|
| 272 | // We've either run out of characters or have parsed a number
|
---|
| 273 | if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
|
---|
| 274 | int number = Integer.parseInt(number_str.toString());
|
---|
| 275 | processed.append((char)number);
|
---|
| 276 | index = index + offset;
|
---|
| 277 | number_str = null;
|
---|
| 278 | break;
|
---|
| 279 | }
|
---|
| 280 | number_str = null;
|
---|
| 281 | }
|
---|
| 282 | }
|
---|
| 283 | processed.append(c0);
|
---|
| 284 | break;
|
---|
| 285 | case ESCAPE_CHAR:
|
---|
| 286 | // Now the \u00e7 type
|
---|
| 287 | if(index + 1 < raw_length) {
|
---|
| 288 | char c3 = raw.charAt(index + 1);
|
---|
| 289 | if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
|
---|
| 290 | // We read four digits
|
---|
| 291 | String hex_str = raw.substring(index + 2, index + 6);
|
---|
| 292 | int number = Integer.parseInt(hex_str, 16);
|
---|
| 293 | hex_str = null;
|
---|
| 294 | processed.append((char)number);
|
---|
| 295 | index = index + 5;
|
---|
| 296 | break;
|
---|
| 297 | }
|
---|
| 298 | }
|
---|
| 299 | processed.append(c0);
|
---|
| 300 | break;
|
---|
| 301 | default:
|
---|
| 302 | processed.append(c0);
|
---|
| 303 | }
|
---|
| 304 | index++;
|
---|
| 305 | }
|
---|
| 306 | return processed.toString();
|
---|
| 307 | }
|
---|
| 308 |
|
---|
| 309 | static public void main(String[] args) {
|
---|
| 310 | if(args.length < 2) {
|
---|
| 311 | String processed;
|
---|
| 312 | String raw;
|
---|
| 313 | String transform;
|
---|
| 314 |
|
---|
| 315 | System.err.println("Running Test Suite");
|
---|
| 316 |
|
---|
| 317 | transform = "DOM_TO_GREENSTONE";
|
---|
| 318 | System.err.println("Test " + transform);
|
---|
| 319 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
| 320 | System.err.println("Raw: '" + raw + "'");
|
---|
| 321 | processed = transform(raw, transform);
|
---|
| 322 | System.err.println("Processed: '" + processed + "'");
|
---|
| 323 |
|
---|
| 324 | transform = "DOM_TO_TEXT";
|
---|
| 325 | System.err.println("Test " + transform);
|
---|
| 326 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
| 327 | System.err.println("Raw: '" + raw + "'");
|
---|
| 328 | processed = transform(raw, transform);
|
---|
| 329 | System.err.println("Processed: '" + processed + "'");
|
---|
| 330 |
|
---|
| 331 | transform = "GREENSTONE_TO_DOM";
|
---|
| 332 | System.err.println("Test " + transform);
|
---|
| 333 | raw = "A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
|
---|
| 334 | System.err.println("Raw: '" + raw + "'");
|
---|
| 335 | processed = transform(raw, transform);
|
---|
| 336 | System.err.println("Processed: '" + processed + "'");
|
---|
| 337 |
|
---|
| 338 | transform = "GREENSTONE_TO_TEXT";
|
---|
| 339 | System.err.println("Test " + transform);
|
---|
| 340 | raw = "These \\[ \\] should be escaped, and so should \\\\ that. These " ' \\n are encoded.";
|
---|
| 341 | System.err.println("Raw: '" + raw + "'");
|
---|
| 342 | processed = transform(raw, transform);
|
---|
| 343 | System.err.println("Processed: '" + processed + "'");
|
---|
| 344 |
|
---|
| 345 | transform = "TEXT_TO_DOM";
|
---|
| 346 | System.err.println("Test " + transform);
|
---|
| 347 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
| 348 | System.err.println("Raw: '" + raw + "'");
|
---|
| 349 | processed = transform(raw, transform);
|
---|
| 350 | System.err.println("Processed: '" + processed + "'");
|
---|
| 351 |
|
---|
| 352 | transform = "TEXT_TO_GREENSTONE";
|
---|
| 353 | System.err.println("Test " + transform);
|
---|
| 354 | raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
|
---|
| 355 | System.err.println("Raw: '" + raw + "'");
|
---|
| 356 | processed = transform(raw, transform);
|
---|
| 357 | System.err.println("Processed: '" + processed + "'");
|
---|
| 358 |
|
---|
| 359 | transform = "TEXT_TO_SHELL";
|
---|
| 360 | System.err.println("Test " + transform);
|
---|
| 361 | if(Utility.isWindows()) {
|
---|
| 362 | System.err.println("[Windows Version]");
|
---|
| 363 | transform = "TEXT_TO_SHELL_WINDOWS";
|
---|
| 364 | }
|
---|
| 365 | else {
|
---|
| 366 | System.err.println("[Unix Version]");
|
---|
| 367 | transform = "TEXT_TO_SHELL_UNIX";
|
---|
| 368 | }
|
---|
| 369 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
| 370 | System.err.println("Raw: '" + raw + "'");
|
---|
| 371 | processed = transform(raw, transform);
|
---|
| 372 | System.err.println("Processed: '" + processed + "'");
|
---|
| 373 |
|
---|
| 374 | System.err.println("***** UNICODE TEST *****");
|
---|
| 375 | System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
|
---|
| 376 | System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
|
---|
| 377 | System.err.println("\\u007a => " + transformUnicode("\\u007a"));
|
---|
| 378 | System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
|
---|
| 379 | System.err.println("0 => " + transformUnicode("0"));
|
---|
| 380 | System.err.println("A => " + transformUnicode("A"));
|
---|
| 381 | System.err.println("z => " + transformUnicode("z"));
|
---|
| 382 | System.err.println("ç => " + transformUnicode("ç"));
|
---|
| 383 | }
|
---|
| 384 | else {
|
---|
| 385 | System.err.println("Raw: '" + args[0] + "'");
|
---|
| 386 | System.err.println("Transform: " + args[1]);
|
---|
| 387 | String processed = transform(args[0], args[1]);
|
---|
| 388 | System.err.println("Processed: '" + processed + "'");
|
---|
| 389 | }
|
---|
| 390 | }
|
---|
| 391 | }
|
---|