/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;

import java.util.*;
import java.util.regex.Pattern;

/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters).
 * <p>Each named transform is an ordered list of (regular expression, replacement) rules which are applied one after another to the whole string, so the order of the rules within a table is significant. Replacement strings follow the java.util.regex replacement syntax, where a backslash escapes the following character.</p>
 * <p>All state is held in unsynchronised static fields; callers that use this class from several threads must provide their own locking.</p>
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    static final public String DECODE_PATH = "DECODE_PATH";
    static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
    static final public String ENCODE_PATH = "ENCODE_PATH";
    static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
    static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
    static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** The number of cached results at which the whole cache is thrown away and started again. */
    static final private int MAX_CACHE_SIZE = 100;
    /** The largest value that is a valid Unicode code point. */
    static final private int MAX_CODE_POINT = 0x10FFFF;

    /** Maps a transform name (String) to its ordered rules (Rule[]). */
    static final private HashMap TRANSFORMS = new HashMap();
    /** Maps a transform name (String) to a HashMap of raw string to processed string. */
    static final private HashMap CACHE = new HashMap();
    /** The total number of results currently held across every inner map of CACHE. */
    static private int cache_entries = 0;

    /** A single replacement step: a regular expression compiled once, and the replacement applied to each match. */
    static final private class Rule {
        final Pattern target;
        final String replacement;

        Rule(Pattern target, String replacement) {
            this.target = target;
            this.replacement = replacement;
        }
    }

    /** Compiles a flat array of (regular expression, replacement) pairs and stores it under the given transform name.
     * @param name the name the transform is looked up by
     * @param pairs alternating regular expressions and replacement strings; a trailing unpaired element is ignored
     */
    static private void register(String name, String[] pairs) {
        Rule[] rules = new Rule[pairs.length / 2];
        for(int i = 0; i < rules.length; i++) {
            rules[i] = new Rule(Pattern.compile(pairs[2 * i]), pairs[(2 * i) + 1]);
        }
        TRANSFORMS.put(name, rules);
    }

    /** Static function called to construct TRANSFORMS mappings */
    static {
        // Turn | back into a backslash, then the entity &#124; back into |
        register(DECODE_PATH, new String[] {
            "\\|", "\\\\",
            "&#124;", "\\|"
        });

        // Turn the entities &#091; and &#093; back into [ and ]
        register(DECODE_SQUARE_BRACKETS, new String[] {
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Translate DOM encoded text into Greenstone encoding
        // removed "\n", "\\\\n", - config files are allowed new lines
        // added "\\|", "\\\\"
        register(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text. &amp; must be handled last so that freshly produced ampersands are not decoded a second time
        register(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Transform text into a regular expression that will match it. The backslash must be escaped first
        register(TEXT_TO_REGEXP, new String[] {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\."
        });

        // Turn | into the entity &#124;, then every backslash into |
        register(ENCODE_PATH, new String[] {
            "\\|", "&#124;",
            "\\\\", "\\|"
        });

        // Transform text into text, but without [ and ]
        register(ENCODE_SQUARE_BRACKETS, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;"
        });

        // Transform Greenstone encoded text to DOM encoding. & must be handled first so that the ampersands of the entities produced below are left alone
        // removed "\\\\n", "\n", added "\\\\", "\\|"
        register(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text
        // removed "\\\\n", "\n", "\\|", "\\\\"
        register(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform plain html text into something that can be placed in a DOM
        register(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Unescape html (or xml) text. &apos; is deliberately left untouched
        register(ESCAPEDHTML_TO_UNESCAPED, new String[] {
            "&amp;", "&",
            "&lt;", "<",
            "&gt;", ">",
            "&quot;", "\""
        });

        // Transform plain html text into greenstone encoding
        // removed "\'", "&apos;" and "\\\\", "\\|"
        register(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        register(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
        register(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /** Applies the named transform to a string, remembering recent results so repeated requests are answered from the cache.
     * @param raw the string to convert; may be null
     * @param transform the name of the transform to apply, normally one of the constants of this class
     * @return the converted string, or <i>raw</i> itself when it is null or when no transform of that name is registered
     */
    static public String transform(String raw, String transform) {
        if(raw == null) {
            return raw;
        }
        Rule[] rules = (Rule[]) TRANSFORMS.get(transform);
        if(rules == null) {
            // Nothing registered under that name, so there is nothing to do (and nothing worth caching)
            return raw;
        }
        HashMap per_transform = (HashMap) CACHE.get(transform);
        if(per_transform != null) {
            String cached = (String) per_transform.get(raw);
            if(cached != null) {
                return cached;
            }
        }
        String processed = raw;
        for(int i = 0; i < rules.length; i++) {
            processed = rules[i].target.matcher(processed).replaceAll(rules[i].replacement);
        }
        // If cache is at maximum size, empty it and start again
        if(cache_entries >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cache_entries = 0;
            per_transform = null;
        }
        if(per_transform == null) {
            per_transform = new HashMap();
            CACHE.put(transform, per_transform);
        }
        // raw is known to be absent from per_transform here, so this always adds exactly one entry
        per_transform.put(raw, processed);
        cache_entries++;
        return processed;
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /** Transform either of the accepted unicode escape sequence styles found in the string into single characters. The two styles are the decimal HTML character reference (ampersand, hash, decimal digits, semicolon) and the Java style escape (backslash, u or U, exactly four hexadecimal digits).
     * Anything that looks like the start of an escape but is not well formed (no digits, a non-digit before the semicolon, no semicolon, fewer than four hexadecimal digits, or a value beyond the Unicode range) is copied through unchanged.
     * @param raw the string to scan; may be null
     * @return the string with every well formed escape replaced by the character it names, or null if <i>raw</i> is null
     */
    static public String transformUnicode(String raw) {
        if(raw == null) {
            return raw;
        }
        int raw_length = raw.length();
        StringBuffer processed = new StringBuffer(raw_length);
        int index = 0;
        while(index < raw_length) {
            char c0 = raw.charAt(index);
            if(c0 == AND_CHAR && index + 1 < raw_length && raw.charAt(index + 1) == HASH_CHAR) {
                // First the HTML decimal character reference. Only digits are consumed, so a malformed reference costs no more than its own length to reject
                int end = index + 2;
                while(end < raw_length && Character.digit(raw.charAt(end), 10) >= 0) {
                    end++;
                }
                if(end < raw_length && raw.charAt(end) == SEMICOLON_CHAR) {
                    int number = parseCodePoint(raw, index + 2, end, 10);
                    if(number >= 0) {
                        appendCodePoint(processed, number);
                        index = end + 1;
                        continue;
                    }
                }
            }
            else if(c0 == ESCAPE_CHAR && index + 5 < raw_length) {
                // Now the Java style escape, which always has exactly four hexadecimal digits
                char c1 = raw.charAt(index + 1);
                if(c1 == UPPER_U_CHAR || c1 == LOWER_U_CHAR) {
                    int number = parseCodePoint(raw, index + 2, index + 6, 16);
                    if(number >= 0) {
                        processed.append((char) number);
                        index = index + 6;
                        continue;
                    }
                }
            }
            // Not an escape (or not a well formed one), so keep the character as it is
            processed.append(c0);
            index++;
        }
        return processed.toString();
    }

    /** Parses the characters of text in [from, to) as an unsigned number in the given radix.
     * @return the value, or -1 if the range is empty, contains a character that is not a digit in that radix, or names a value greater than MAX_CODE_POINT
     */
    static private int parseCodePoint(String text, int from, int to, int radix) {
        if(from >= to) {
            return -1;
        }
        int value = 0;
        for(int i = from; i < to; i++) {
            int digit = Character.digit(text.charAt(i), radix);
            if(digit < 0) {
                return -1;
            }
            // value never exceeds MAX_CODE_POINT before this multiplication, so it cannot overflow an int
            value = (value * radix) + digit;
            if(value > MAX_CODE_POINT) {
                return -1;
            }
        }
        return value;
    }

    /** Appends a code point to the buffer, using a surrogate pair for values that do not fit in a single char. */
    static private void appendCodePoint(StringBuffer buffer, int code_point) {
        if(code_point <= 0xFFFF) {
            buffer.append((char) code_point);
        }
        else {
            int offset = code_point - 0x10000;
            buffer.append((char) (0xD800 + (offset >> 10)));
            buffer.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /** Prints a raw string and the result of applying the named transform to it on standard error. */
    static private void runTest(String transform, String raw) {
        System.err.println("Raw: '" + raw + "'");
        String processed = transform(raw, transform);
        System.err.println("Processed: '" + processed + "'");
    }

    /** Command line entry point. With fewer than two arguments a built-in demonstration of the transforms is printed to standard error; otherwise the first argument is converted using the transform named by the second.
     * @param args either nothing, or the raw string followed by the transform name
     */
    static public void main(String[] args) {
        if(args.length < 2) {
            System.err.println("Running Test Suite");

            System.err.println("Test " + DOM_TO_GREENSTONE);
            runTest(DOM_TO_GREENSTONE, "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;");

            System.err.println("Test " + DOM_TO_TEXT);
            runTest(DOM_TO_TEXT, "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;");

            System.err.println("Test " + GREENSTONE_TO_DOM);
            runTest(GREENSTONE_TO_DOM, "A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>");

            System.err.println("Test " + GREENSTONE_TO_TEXT);
            runTest(GREENSTONE_TO_TEXT, "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.");

            System.err.println("Test " + TEXT_TO_DOM);
            runTest(TEXT_TO_DOM, "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>");

            System.err.println("Test " + TEXT_TO_GREENSTONE);
            runTest(TEXT_TO_GREENSTONE, "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.");

            // Both shell flavours are exercised, whatever platform the test suite happens to be running on
            System.err.println("Test TEXT_TO_SHELL");
            System.err.println("[Unix Version]");
            runTest(TEXT_TO_SHELL_UNIX, "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>");
            System.err.println("[Windows Version]");
            runTest(TEXT_TO_SHELL_WINDOWS, "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>");

            System.err.println("***** UNICODE TEST *****");
            System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
            System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
            System.err.println("\\u007a => " + transformUnicode("\\u007a"));
            System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
            System.err.println("&#48; => " + transformUnicode("&#48;"));
            System.err.println("&#65; => " + transformUnicode("&#65;"));
            System.err.println("&#122; => " + transformUnicode("&#122;"));
            System.err.println("&#231; => " + transformUnicode("&#231;"));
        }
        else {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
        }
    }
}