/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;

import java.util.*;

/**
 * Provides a standard, extensible way to convert from one format of string to
 * another (given that each format has differing requirements regarding legal
 * characters and escaped characters).
 * <p>
 * Each named transform is an ordered list of (pattern, replacement) pairs which
 * {@link #transform(String, String)} applies one after another with
 * {@link String#replaceAll(String, String)}. Both halves of a pair are therefore
 * written in regular expression / regular expression replacement syntax, not as
 * plain text, and the order of the pairs within a table matters.
 * <p>
 * The class keeps a small shared cache of recent results and does no locking of
 * its own.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    static final public String DECODE_PATH = "DECODE_PATH";
    static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
    static final public String ENCODE_PATH = "ENCODE_PATH";
    static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
    static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** Maximum number of cached (transform, raw) results kept before the cache is emptied. */
    static final private int MAX_CACHE_SIZE = 100;

    /** Maps a transform name to its String[] of alternating pattern / replacement entries. */
    static private HashMap TRANSFORMS;
    /** Maps a transform name to a HashMap of raw string to processed string. */
    static private HashMap CACHE;
    /** Number of results currently held in CACHE, summed over every transform. */
    static private int cache_entries = 0;

    /** Static function called to construct TRANSFORMS mappings */
    static {
        TRANSFORMS = new HashMap();

        // Undo ENCODE_PATH: pipes go back to backslashes, then the pipe entity goes back to a pipe
        String[] decode_path = {
            "\\|", "\\\\",
            "&#124;", "\\|"
        };
        TRANSFORMS.put(DECODE_PATH, decode_path);

        // Turn the square bracket entities back into [ and ]
        String[] decode_square_brackets = {
            "&#091;", "\\[",
            "&#093;", "\\]"
        };
        TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);

        // Translate DOM encoded text into Greenstone encoding
        // removed "\n", "\\\\n", - config files are allowed new lines
        // The ampersand entity is handled last so that freshly decoded ampersands are not decoded twice
        String[] dom_to_greenstone = {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        };
        TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);

        // Transform DOM encoded text into plain text
        String[] dom_to_text = {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        };
        TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);

        // Transform text into a regular expression that will match it
        String[] text_to_regexp = {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\."
        };
        TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);

        // Protect real pipes as an entity first, then use the pipe in place of the backslash
        String[] encode_path = {
            "\\|", "&#124;",
            "\\\\", "\\|"
        };
        TRANSFORMS.put(ENCODE_PATH, encode_path);

        // Transform text into text, but without [ and ]
        String[] encode_square_brackets = {
            "\\[", "&#091;",
            "\\]", "&#093;"
        };
        TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);

        // Transform Greenstone encoded text to DOM encoding
        // removed "\\\\n", "\n"
        // The ampersand is handled first so that the entities produced below are not escaped again
        String[] greenstone_to_dom = {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        };
        TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);

        // Transform Greenstone encoded text to plain text
        // removed "\\\\n", "\n"
        String[] greenstone_to_text = {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        };
        TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);

        // Transform plain html text into something that can be placed in a DOM
        String[] text_to_dom = {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        };
        TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);

        // Transform plain html text into greenstone encoding
        // (single quotes are deliberately left alone)
        String[] text_to_greenstone = {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        };
        TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);

        // Transform plain html text into something that can be placed in a shell command
        String[] text_to_shell_unix = {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        };
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);

        // Transform plain html text into something that can be placed in a shell command. Windows requires
        // more escapes for speech marks to be passed to underlying processes
        String[] text_to_shell_windows = {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        };
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);

        CACHE = new HashMap();
    }

    /**
     * Applies the named transform to a string.
     * <p>
     * The pairs registered for the transform are applied in order with
     * {@link String#replaceAll(String, String)}. Results are remembered in a small
     * cache keyed on (transform, raw); once the cache holds MAX_CACHE_SIZE results
     * it is emptied and refilled from scratch.
     *
     * @param raw the string to convert, may be null
     * @param transform the name of the transform to apply, normally one of the constants of this class
     * @return the converted string; null if raw was null; raw unchanged if no transform of that name is registered
     */
    static public String transform(String raw, String transform) {
        if(raw == null) {
            return raw;
        }

        HashMap cached_results = (HashMap) CACHE.get(transform);
        if(cached_results != null) {
            String cached = (String) cached_results.get(raw);
            if(cached != null) {
                return cached;
            }
        }

        String processed = raw;
        String[] transforms = (String[]) TRANSFORMS.get(transform);
        if(transforms != null) {
            // Entries come in (pattern, replacement) pairs
            for(int i = 0; i + 1 < transforms.length; i = i + 2) {
                processed = processed.replaceAll(transforms[i], transforms[i + 1]);
            }
        }

        // If cache is at maximum size, empty it and start again. The entries are counted here
        // rather than asking the container, so the bound applies to individual results.
        if(cache_entries >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cache_entries = 0;
            cached_results = null;
        }
        if(cached_results == null) {
            cached_results = new HashMap();
            CACHE.put(transform, cached_results);
        }
        cached_results.put(raw, processed);
        cache_entries++;
        return processed;
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';
    /** Largest value that is a Unicode code point. */
    static final private int MAX_CODE_POINT = 0x10FFFF;

    /**
     * Transform either of the accepted unicode escape sequence styles found in the
     * string into the characters they stand for. The two styles are the decimal
     * HTML character reference (ampersand, hash, decimal digits, semicolon) and the
     * Java style escape (backslash, the letter u in either case, exactly four
     * hexadecimal digits).
     * <p>
     * Anything that merely resembles one of the two styles, such as a reference
     * with non-digit content or a backslash-u that is not followed by four
     * hexadecimal digits, is copied through unchanged instead of raising an
     * exception.
     *
     * @param raw the string to process, may be null
     * @return the string with every well formed escape replaced, or null if raw was null
     */
    static public String transformUnicode(String raw) {
        if(raw == null) {
            return raw;
        }
        int raw_length = raw.length();
        StringBuffer processed = new StringBuffer(raw_length);
        int index = 0;
        while(index < raw_length) {
            char c0 = raw.charAt(index);
            int consumed = 0;
            if(c0 == AND_CHAR) {
                consumed = decodeNumericEntity(raw, index, processed);
            }
            else if(c0 == ESCAPE_CHAR) {
                consumed = decodeHexEscape(raw, index, processed);
            }
            // Not the start of a well formed escape, so the character stands for itself
            if(consumed == 0) {
                processed.append(c0);
                consumed = 1;
            }
            index = index + consumed;
        }
        return processed.toString();
    }

    /** Decodes a decimal character reference beginning at start; returns the number of characters consumed, or 0 if there is no well formed reference there (nothing is appended in that case). */
    static private int decodeNumericEntity(String raw, int start, StringBuffer out) {
        int raw_length = raw.length();
        if(start + 1 >= raw_length || raw.charAt(start + 1) != HASH_CHAR) {
            return 0;
        }
        int first_digit = start + 2;
        int value = 0;
        for(int position = first_digit; position < raw_length; position++) {
            char c = raw.charAt(position);
            if(c == SEMICOLON_CHAR) {
                if(position == first_digit) {
                    return 0; // No digits at all
                }
                appendCodePoint(out, value);
                return position - start + 1;
            }
            int digit = Character.digit(c, 10);
            if(digit < 0) {
                return 0;
            }
            value = value * 10 + digit;
            // Checking on every digit also keeps value far away from int overflow
            if(value > MAX_CODE_POINT) {
                return 0;
            }
        }
        // Ran out of characters before finding the semicolon
        return 0;
    }

    /** Decodes a backslash-u escape with four hex digits beginning at start; returns the number of characters consumed (always 6), or 0 if there is no well formed escape there (nothing is appended in that case). */
    static private int decodeHexEscape(String raw, int start, StringBuffer out) {
        // Six characters are needed in total: backslash, u and four digits
        if(start + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(start + 1);
        if(marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        int value = 0;
        for(int position = start + 2; position < start + 6; position++) {
            int digit = Character.digit(raw.charAt(position), 16);
            if(digit < 0) {
                return 0;
            }
            value = (value << 4) | digit;
        }
        out.append((char) value);
        return 6;
    }

    /** Appends a code point to the buffer, writing values above 0xFFFF as a UTF-16 surrogate pair instead of truncating them to a single char. */
    static private void appendCodePoint(StringBuffer out, int code_point) {
        if(code_point < 0x10000) {
            out.append((char) code_point);
        }
        else {
            int offset = code_point - 0x10000;
            out.append((char) (0xD800 + (offset >> 10)));
            out.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /**
     * Command line entry point. With two or more arguments the first is treated as
     * the raw string and the second as the transform name, and the result is
     * printed. With fewer arguments a built-in set of examples is run, one per
     * transform (both shell variants are shown, whatever the host platform),
     * followed by the unicode escape examples. All output goes to System.err.
     *
     * @param args either empty, or the raw string followed by a transform name
     */
    static public void main(String[] args) {
        if(args.length >= 2) {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
            return;
        }

        System.err.println("Running Test Suite");

        String dom_sample = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
        String text_sample = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

        runExample(DOM_TO_GREENSTONE, dom_sample);
        runExample(DOM_TO_TEXT, dom_sample);
        runExample(GREENSTONE_TO_DOM, "A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>");
        runExample(GREENSTONE_TO_TEXT, "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.");
        runExample(TEXT_TO_DOM, text_sample);
        runExample(TEXT_TO_GREENSTONE, "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.");
        // The shell transforms are plain string rewrites, so both can be shown on any platform
        runExample(TEXT_TO_SHELL_UNIX, text_sample);
        runExample(TEXT_TO_SHELL_WINDOWS, text_sample);

        System.err.println("***** UNICODE TEST *****");
        System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
        System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
        System.err.println("\\u007a => " + transformUnicode("\\u007a"));
        System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
        System.err.println("&#48; => " + transformUnicode("&#48;"));
        System.err.println("&#65; => " + transformUnicode("&#65;"));
        System.err.println("&#122; => " + transformUnicode("&#122;"));
        System.err.println("&#231; => " + transformUnicode("&#231;"));
    }

    /** Prints one test suite entry: the transform name, the raw input and the processed output. */
    static private void runExample(String transform, String raw) {
        System.err.println("Test " + transform);
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transform) + "'");
    }
}