1 | /**
|
---|
2 | *#########################################################################
|
---|
3 | *
|
---|
4 | * A component of the Gatherer application, part of the Greenstone digital
|
---|
5 | * library suite from the New Zealand Digital Library Project at the
|
---|
6 | * University of Waikato, New Zealand.
|
---|
7 | *
|
---|
8 | * Author: John Thompson, Greenstone Digital Library, University of Waikato
|
---|
9 | *
|
---|
10 | * Copyright (C) 1999 New Zealand Digital Library Project
|
---|
11 | *
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
15 | * (at your option) any later version.
|
---|
16 | *
|
---|
17 | * This program is distributed in the hope that it will be useful,
|
---|
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | * GNU General Public License for more details.
|
---|
21 | *
|
---|
22 | * You should have received a copy of the GNU General Public License
|
---|
23 | * along with this program; if not, write to the Free Software
|
---|
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | *########################################################################
|
---|
26 | */
|
---|
27 | package org.greenstone.gatherer.util;
|
---|
28 |
|
---|
29 | import java.util.*;
|
---|
30 |
|
---|
31 | /** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
|
---|
32 | * @author John Thompson, Greenstone Digital Library, University of Waikato
|
---|
33 | * @version 2.3d
|
---|
34 | */
|
---|
35 | public class Codec {
|
---|
36 |
|
---|
37 | static final public String DECODE_PATH = "DECODE_PATH";
|
---|
38 | static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
|
---|
39 | static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
|
---|
40 | static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
|
---|
41 | static final public String ENCODE_PATH = "ENCODE_PATH";
|
---|
42 | static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
|
---|
43 | static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
|
---|
44 | static final public String REINSTATE_HTML_TAGS = "REINSTATE_HTML_TAGS";
|
---|
45 | static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
|
---|
46 | static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
|
---|
47 | static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
|
---|
48 | static final public String TEXT_TO_DOM_PRESERVE_TAGS = "TEXT_TO_DOM_PRESERVE_TAGS";
|
---|
49 | static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
|
---|
50 | static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
|
---|
51 | static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
|
---|
52 | static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
|
---|
53 |
|
---|
54 | static final private int MAX_CACHE_SIZE = 100;
|
---|
55 |
|
---|
56 | static private HashMap TRANSFORMS;
|
---|
57 | static private HashMap3D CACHE;
|
---|
58 |
|
---|
59 | /** Static function called to construct TRANSFORMS mappings */
|
---|
60 | static {
|
---|
61 | TRANSFORMS = new HashMap();
|
---|
62 |
|
---|
63 | String[] decode_path = {
|
---|
64 | "\\|", "\\\\",
|
---|
65 | "|", "\\|"
|
---|
66 | };
|
---|
67 | TRANSFORMS.put(DECODE_PATH, decode_path);
|
---|
68 | decode_path = null;
|
---|
69 |
|
---|
70 | // Transform text into text, but without [ and ]
|
---|
71 | String[] decode_square_brackets = {
|
---|
72 | "[", "\\[",
|
---|
73 | "]", "\\]"
|
---|
74 | };
|
---|
75 | TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
|
---|
76 | decode_square_brackets = null;
|
---|
77 |
|
---|
78 | // Translate DOM encoded text into Greenstone encoding
|
---|
79 | String[] dom_to_greenstone = {
|
---|
80 | "'", "\\\\\'",
|
---|
81 | ">", ">",
|
---|
82 | "<", "<",
|
---|
83 | """, "\\\\\"",
|
---|
84 | "&", "&"
|
---|
85 | };
|
---|
86 | // removed "\n", "\\\\n", - config files are allowed new lines
|
---|
87 | // added "\\|", "\\\\"
|
---|
88 |
|
---|
89 | TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
|
---|
90 | dom_to_greenstone = null;
|
---|
91 |
|
---|
92 | // Transform DOM encoded text into plain text
|
---|
93 | String[] dom_to_text = {
|
---|
94 | "&#091;", "\\[",
|
---|
95 | "&#093;", "\\]",
|
---|
96 | "'", "\'",
|
---|
97 | ">", ">",
|
---|
98 | "<", "<",
|
---|
99 | """, "\"",
|
---|
100 | "&", "&"
|
---|
101 | };
|
---|
102 | TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
|
---|
103 | dom_to_text = null;
|
---|
104 |
|
---|
105 | // Transform text into a regular expression that will match it
|
---|
106 | String[] text_to_regexp = {
|
---|
107 | "\\\\", "\\\\\\\\",
|
---|
108 | "\\(", "\\\\(",
|
---|
109 | "\\)", "\\\\)",
|
---|
110 | "\\[", "\\\\[",
|
---|
111 | "\\]", "\\\\]",
|
---|
112 | "\\{", "\\\\{",
|
---|
113 | "\\}", "\\\\}",
|
---|
114 | "\\.", "\\\\."
|
---|
115 | };
|
---|
116 | TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
|
---|
117 | text_to_regexp = null;
|
---|
118 |
|
---|
119 | String[] encode_path = {
|
---|
120 | "\\|", "|",
|
---|
121 | "\\\\", "\\|"
|
---|
122 | };
|
---|
123 | TRANSFORMS.put(ENCODE_PATH, encode_path);
|
---|
124 | encode_path = null;
|
---|
125 |
|
---|
126 | // Transform text into text, but without [ and ]
|
---|
127 | String[] encode_square_brackets = {
|
---|
128 | "\\[", "[",
|
---|
129 | "\\]", "]"
|
---|
130 | };
|
---|
131 | TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
|
---|
132 | encode_square_brackets = null;
|
---|
133 |
|
---|
134 | // Transform Greenstone encoded text to DOM encoding
|
---|
135 | String[] greenstone_to_dom = {
|
---|
136 | "&", "&",
|
---|
137 | "<", "<",
|
---|
138 | ">", ">",
|
---|
139 | "\\\\\"", """,
|
---|
140 | "\\\\\'", "'",
|
---|
141 | "\"", """,
|
---|
142 | "\'", "'"
|
---|
143 | };
|
---|
144 | // removed"\\\\n", "\n", added "\\\\", "\\|"
|
---|
145 |
|
---|
146 | TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
|
---|
147 | greenstone_to_dom = null;
|
---|
148 |
|
---|
149 | // Transform Greenstone encoded text to plain text
|
---|
150 | String[] greenstone_to_text = {
|
---|
151 | "\\\\\"", "\"",
|
---|
152 | "\\\\\'", "\'",
|
---|
153 | """, "\"",
|
---|
154 | "'", "\'",
|
---|
155 | "[", "\\[",
|
---|
156 | "]", "\\]"
|
---|
157 | };
|
---|
158 | // removed "\\\\n", "\n", "\\|", "\\\\"
|
---|
159 |
|
---|
160 | TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
|
---|
161 | greenstone_to_text = null;
|
---|
162 |
|
---|
163 | // Transform plain html text into something that can be placed in a DOM
|
---|
164 | String[] text_to_dom = {
|
---|
165 | "&", "&",
|
---|
166 | "<", "<",
|
---|
167 | ">", ">",
|
---|
168 | "\"", """,
|
---|
169 | "\'", "'"
|
---|
170 | };
|
---|
171 | TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
|
---|
172 | text_to_dom = null;
|
---|
173 |
|
---|
174 | // Same as above, but preserve html element tags
|
---|
175 | String[] text_to_dom_preserve_tags = {
|
---|
176 | "&", "&",
|
---|
177 | "\"", """,
|
---|
178 | //"\'", "'"
|
---|
179 | };
|
---|
180 | TRANSFORMS.put(TEXT_TO_DOM_PRESERVE_TAGS, text_to_dom_preserve_tags);
|
---|
181 | text_to_dom_preserve_tags = null;
|
---|
182 |
|
---|
183 | // Unescape html (or xml) text
|
---|
184 | String[] escapedhtml_to_unescaped = {
|
---|
185 | "&", "&",
|
---|
186 | "<", "<",
|
---|
187 | ">", ">",
|
---|
188 | """, "\""//,
|
---|
189 | //"'", "\'"
|
---|
190 | };
|
---|
191 | TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, escapedhtml_to_unescaped);
|
---|
192 | escapedhtml_to_unescaped = null;
|
---|
193 |
|
---|
194 | // Reinstate tag markers <>
|
---|
195 | String[] reinstate_html_tags = {
|
---|
196 | "<", "<",
|
---|
197 | ">", ">",
|
---|
198 | };
|
---|
199 | TRANSFORMS.put(REINSTATE_HTML_TAGS, reinstate_html_tags);
|
---|
200 | reinstate_html_tags = null;
|
---|
201 |
|
---|
202 |
|
---|
203 | // Transform plain html text into greenstone encoding
|
---|
204 | String[] text_to_greenstone = {
|
---|
205 |
|
---|
206 | "\\[", "[",
|
---|
207 | "\\]", "]",
|
---|
208 | "\"", """,
|
---|
209 | "\n", "\\\\n"
|
---|
210 | };
|
---|
211 | // "\'", "'",
|
---|
212 | // removed "\\\\", "\\|",
|
---|
213 | TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
|
---|
214 | text_to_greenstone = null;
|
---|
215 |
|
---|
216 | // Transform plain html text into something that can be placed in a shell command
|
---|
217 | String[] text_to_shell_unix = {
|
---|
218 | "\"", "\\\\\"",
|
---|
219 | "\'", "\\\\\'",
|
---|
220 | "\n", "\\\\n"
|
---|
221 | };
|
---|
222 | TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
|
---|
223 | text_to_shell_unix = null;
|
---|
224 |
|
---|
225 | // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
|
---|
226 | String[] text_to_shell_windows = {
|
---|
227 | "\"", "\\\\\\\\\\\\\"",
|
---|
228 | "\'", "\\\\\'",
|
---|
229 | "\n", "\\\\n"
|
---|
230 | };
|
---|
231 | TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
|
---|
232 | text_to_shell_windows = null;
|
---|
233 |
|
---|
234 | CACHE = new HashMap3D();
|
---|
235 | }
|
---|
236 |
|
---|
237 | static public String transform(String raw, String transform) {
|
---|
238 | if(raw == null) {
|
---|
239 | return raw;
|
---|
240 | }
|
---|
241 | // System.err.println("Transforming by "+transform+":\n" + raw);
|
---|
242 | String processed = (String) CACHE.get(transform, raw);
|
---|
243 | if(processed == null) {
|
---|
244 | processed = raw;
|
---|
245 | String[] transforms = (String[]) TRANSFORMS.get(transform);
|
---|
246 | if(transforms != null) {
|
---|
247 | for(int i = 0; i < transforms.length; i = i + 2) {
|
---|
248 | String target = transforms[i];
|
---|
249 | String result = transforms[i+1];
|
---|
250 | processed = processed.replaceAll(target, result);
|
---|
251 | }
|
---|
252 | }
|
---|
253 | //DebugStream.println("\n*** Transform: " + transform + " ***");
|
---|
254 | //DebugStream.println("*** Raw : '" + raw + "'");
|
---|
255 | //DebugStream.println("*** Processed: '" + processed + "'");
|
---|
256 | // If cache is at maximum size, empty it and start again
|
---|
257 | if(CACHE.size() == MAX_CACHE_SIZE) {
|
---|
258 | CACHE.clear();
|
---|
259 | }
|
---|
260 | CACHE.put(transform, raw, processed);
|
---|
261 | }
|
---|
262 | return processed;
|
---|
263 | }
|
---|
264 |
|
---|
265 | /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
|
---|
266 | static final private char AND_CHAR = '&';
|
---|
267 | static final private char ESCAPE_CHAR = '\\';
|
---|
268 | static final private char HASH_CHAR = '#';
|
---|
269 | static final private char LOWER_U_CHAR = 'u';
|
---|
270 | static final private char UPPER_U_CHAR = 'U';
|
---|
271 | static final private char SEMICOLON_CHAR = ';';
|
---|
272 |
|
---|
273 | static public String transformUnicode(String raw) {
|
---|
274 | StringBuffer processed = new StringBuffer();
|
---|
275 | int index = 0;
|
---|
276 | int raw_length = raw.length();
|
---|
277 | while(index < raw_length) {
|
---|
278 | char c0 = raw.charAt(index);
|
---|
279 | switch(c0) {
|
---|
280 | case AND_CHAR:
|
---|
281 | if(index + 1 < raw_length) {
|
---|
282 | // First the HTML ç type
|
---|
283 | char c1 = raw.charAt(index + 1);
|
---|
284 | if(c1 == HASH_CHAR) {
|
---|
285 | StringBuffer number_str = new StringBuffer();
|
---|
286 | char c2;
|
---|
287 | int offset = 2;
|
---|
288 | while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
|
---|
289 | number_str.append(c2);
|
---|
290 | offset++;
|
---|
291 | }
|
---|
292 | // We've either run out of characters or have parsed a number
|
---|
293 | if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
|
---|
294 | int number = Integer.parseInt(number_str.toString());
|
---|
295 | processed.append((char)number);
|
---|
296 | index = index + offset;
|
---|
297 | number_str = null;
|
---|
298 | break;
|
---|
299 | }
|
---|
300 | number_str = null;
|
---|
301 | }
|
---|
302 | }
|
---|
303 | processed.append(c0);
|
---|
304 | break;
|
---|
305 | case ESCAPE_CHAR:
|
---|
306 | // Now the \u00e7 type
|
---|
307 | if(index + 1 < raw_length) {
|
---|
308 | char c3 = raw.charAt(index + 1);
|
---|
309 | if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
|
---|
310 | // We read four digits
|
---|
311 | String hex_str = raw.substring(index + 2, index + 6);
|
---|
312 | int number = Integer.parseInt(hex_str, 16);
|
---|
313 | hex_str = null;
|
---|
314 | processed.append((char)number);
|
---|
315 | index = index + 5;
|
---|
316 | break;
|
---|
317 | }
|
---|
318 | }
|
---|
319 | processed.append(c0);
|
---|
320 | break;
|
---|
321 | default:
|
---|
322 | processed.append(c0);
|
---|
323 | }
|
---|
324 | index++;
|
---|
325 | }
|
---|
326 | return processed.toString();
|
---|
327 | }
|
---|
328 |
|
---|
329 | static public void main(String[] args) {
|
---|
330 | if(args.length < 2) {
|
---|
331 | String processed;
|
---|
332 | String raw;
|
---|
333 | String transform;
|
---|
334 |
|
---|
335 | System.err.println("Running Test Suite");
|
---|
336 |
|
---|
337 | transform = "DOM_TO_GREENSTONE";
|
---|
338 | System.err.println("Test " + transform);
|
---|
339 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
340 | System.err.println("Raw: '" + raw + "'");
|
---|
341 | processed = transform(raw, transform);
|
---|
342 | System.err.println("Processed: '" + processed + "'");
|
---|
343 |
|
---|
344 | transform = "DOM_TO_TEXT";
|
---|
345 | System.err.println("Test " + transform);
|
---|
346 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
347 | System.err.println("Raw: '" + raw + "'");
|
---|
348 | processed = transform(raw, transform);
|
---|
349 | System.err.println("Processed: '" + processed + "'");
|
---|
350 |
|
---|
351 | transform = "GREENSTONE_TO_DOM";
|
---|
352 | System.err.println("Test " + transform);
|
---|
353 | raw = "A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
|
---|
354 | System.err.println("Raw: '" + raw + "'");
|
---|
355 | processed = transform(raw, transform);
|
---|
356 | System.err.println("Processed: '" + processed + "'");
|
---|
357 |
|
---|
358 | transform = "GREENSTONE_TO_TEXT";
|
---|
359 | System.err.println("Test " + transform);
|
---|
360 | raw = "These \\[ \\] should be escaped, and so should \\\\ that. These " ' \\n are encoded.";
|
---|
361 | System.err.println("Raw: '" + raw + "'");
|
---|
362 | processed = transform(raw, transform);
|
---|
363 | System.err.println("Processed: '" + processed + "'");
|
---|
364 |
|
---|
365 | transform = "TEXT_TO_DOM";
|
---|
366 | System.err.println("Test " + transform);
|
---|
367 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
368 | System.err.println("Raw: '" + raw + "'");
|
---|
369 | processed = transform(raw, transform);
|
---|
370 | System.err.println("Processed: '" + processed + "'");
|
---|
371 |
|
---|
372 | transform = "TEXT_TO_GREENSTONE";
|
---|
373 | System.err.println("Test " + transform);
|
---|
374 | raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
|
---|
375 | System.err.println("Raw: '" + raw + "'");
|
---|
376 | processed = transform(raw, transform);
|
---|
377 | System.err.println("Processed: '" + processed + "'");
|
---|
378 |
|
---|
379 | transform = "TEXT_TO_SHELL";
|
---|
380 | System.err.println("Test " + transform);
|
---|
381 | if(Utility.isWindows()) {
|
---|
382 | System.err.println("[Windows Version]");
|
---|
383 | transform = "TEXT_TO_SHELL_WINDOWS";
|
---|
384 | }
|
---|
385 | else {
|
---|
386 | System.err.println("[Unix Version]");
|
---|
387 | transform = "TEXT_TO_SHELL_UNIX";
|
---|
388 | }
|
---|
389 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
390 | System.err.println("Raw: '" + raw + "'");
|
---|
391 | processed = transform(raw, transform);
|
---|
392 | System.err.println("Processed: '" + processed + "'");
|
---|
393 |
|
---|
394 | System.err.println("***** UNICODE TEST *****");
|
---|
395 | System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
|
---|
396 | System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
|
---|
397 | System.err.println("\\u007a => " + transformUnicode("\\u007a"));
|
---|
398 | System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
|
---|
399 | System.err.println("0 => " + transformUnicode("0"));
|
---|
400 | System.err.println("A => " + transformUnicode("A"));
|
---|
401 | System.err.println("z => " + transformUnicode("z"));
|
---|
402 | System.err.println("ç => " + transformUnicode("ç"));
|
---|
403 | }
|
---|
404 | else {
|
---|
405 | System.err.println("Raw: '" + args[0] + "'");
|
---|
406 | System.err.println("Transform: " + args[1]);
|
---|
407 | String processed = transform(args[0], args[1]);
|
---|
408 | System.err.println("Processed: '" + processed + "'");
|
---|
409 | }
|
---|
410 | }
|
---|
411 | }
|
---|