1 | /**
|
---|
2 | *#########################################################################
|
---|
3 | *
|
---|
4 | * A component of the Gatherer application, part of the Greenstone digital
|
---|
5 | * library suite from the New Zealand Digital Library Project at the
|
---|
6 | * University of Waikato, New Zealand.
|
---|
7 | *
|
---|
8 | * Author: John Thompson, Greenstone Digital Library, University of Waikato
|
---|
9 | *
|
---|
10 | * Copyright (C) 1999 New Zealand Digital Library Project
|
---|
11 | *
|
---|
12 | * This program is free software; you can redistribute it and/or modify
|
---|
13 | * it under the terms of the GNU General Public License as published by
|
---|
14 | * the Free Software Foundation; either version 2 of the License, or
|
---|
15 | * (at your option) any later version.
|
---|
16 | *
|
---|
17 | * This program is distributed in the hope that it will be useful,
|
---|
18 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
19 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
20 | * GNU General Public License for more details.
|
---|
21 | *
|
---|
22 | * You should have received a copy of the GNU General Public License
|
---|
23 | * along with this program; if not, write to the Free Software
|
---|
24 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
25 | *########################################################################
|
---|
26 | */
|
---|
27 | package org.greenstone.gatherer.util;
|
---|
28 | /*************************************************************************
|
---|
29 | * Written: 17-08-03
|
---|
30 | ************************************************************************/
|
---|
31 | import java.util.*;
|
---|
32 | import org.greenstone.gatherer.Gatherer;
|
---|
33 | import org.greenstone.gatherer.util.Utility;
|
---|
/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters).
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
38 | public class Codec {
|
---|
39 |
|
---|
40 | static final public String DECODE_PATH = "DECODE_PATH";
|
---|
41 | static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
|
---|
42 | static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
|
---|
43 | static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
|
---|
44 | static final public String ENCODE_PATH = "ENCODE_PATH";
|
---|
45 | static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
|
---|
46 | static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
|
---|
47 | static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
|
---|
48 | static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
|
---|
49 | static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
|
---|
50 | static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
|
---|
51 | static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
|
---|
52 | static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
|
---|
53 |
|
---|
54 | static final private int MAX_CACHE_SIZE = 100;
|
---|
55 |
|
---|
56 | static private HashMap TRANSFORMS;
|
---|
57 | static private HashMap3D CACHE;
|
---|
58 |
|
---|
59 | /** Static function called to construct TRANSFORMS mappings */
|
---|
60 | static {
|
---|
61 | TRANSFORMS = new HashMap();
|
---|
62 |
|
---|
63 | String[] decode_path = {
|
---|
64 | "\\|", "\\\\",
|
---|
65 | "|", "\\|"
|
---|
66 | };
|
---|
67 | TRANSFORMS.put(DECODE_PATH, decode_path);
|
---|
68 | decode_path = null;
|
---|
69 |
|
---|
70 | // Transform text into text, but without [ and ]
|
---|
71 | String[] decode_square_brackets = {
|
---|
72 | "[", "\\[",
|
---|
73 | "]", "\\]"
|
---|
74 | };
|
---|
75 | TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
|
---|
76 | decode_square_brackets = null;
|
---|
77 |
|
---|
78 | // Translate DOM encoded text into Greenstone encoding
|
---|
79 | String[] dom_to_greenstone = {
|
---|
80 | "'", "\\\\\'",
|
---|
81 | ">", ">",
|
---|
82 | "<", "<",
|
---|
83 | """, "\\\\\"",
|
---|
84 | "&", "&"
|
---|
85 | };
|
---|
86 | // removed "\n", "\\\\n", - config files are allowed new lines
|
---|
87 | // added "\\|", "\\\\"
|
---|
88 |
|
---|
89 | TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
|
---|
90 | dom_to_greenstone = null;
|
---|
91 |
|
---|
92 | // Transform DOM encoded text into plain text
|
---|
93 | String[] dom_to_text = {
|
---|
94 | "&#091;", "\\[",
|
---|
95 | "&#093;", "\\]",
|
---|
96 | "'", "\'",
|
---|
97 | ">", ">",
|
---|
98 | "<", "<",
|
---|
99 | """, "\"",
|
---|
100 | "&", "&"
|
---|
101 | };
|
---|
102 | TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
|
---|
103 | dom_to_text = null;
|
---|
104 |
|
---|
105 | // Transform text into a regular expression that will match it
|
---|
106 | String[] text_to_regexp = {
|
---|
107 | "\\\\", "\\\\\\\\",
|
---|
108 | "\\(", "\\\\(",
|
---|
109 | "\\)", "\\\\)",
|
---|
110 | "\\[", "\\\\[",
|
---|
111 | "\\]", "\\\\]",
|
---|
112 | "\\{", "\\\\{",
|
---|
113 | "\\}", "\\\\}",
|
---|
114 | "\\.", "\\\\."
|
---|
115 | };
|
---|
116 | TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
|
---|
117 | text_to_regexp = null;
|
---|
118 |
|
---|
119 | String[] encode_path = {
|
---|
120 | "\\|", "|",
|
---|
121 | "\\\\", "\\|"
|
---|
122 | };
|
---|
123 | TRANSFORMS.put(ENCODE_PATH, encode_path);
|
---|
124 | encode_path = null;
|
---|
125 |
|
---|
126 | // Transform text into text, but without [ and ]
|
---|
127 | String[] encode_square_brackets = {
|
---|
128 | "\\[", "[",
|
---|
129 | "\\]", "]"
|
---|
130 | };
|
---|
131 | TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
|
---|
132 | encode_square_brackets = null;
|
---|
133 |
|
---|
134 | // Transform Greenstone encoded text to DOM encoding
|
---|
135 | String[] greenstone_to_dom = {
|
---|
136 | "&", "&",
|
---|
137 | "<", "<",
|
---|
138 | ">", ">",
|
---|
139 | "\\\\\"", """,
|
---|
140 | "\\\\\'", "'",
|
---|
141 | "\"", """,
|
---|
142 | "\'", "'"
|
---|
143 | };
|
---|
144 | // removed"\\\\n", "\n", added "\\\\", "\\|"
|
---|
145 |
|
---|
146 | TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
|
---|
147 | greenstone_to_dom = null;
|
---|
148 |
|
---|
149 | // Transform Greenstone encoded text to plain text
|
---|
150 | String[] greenstone_to_text = {
|
---|
151 | "\\\\\"", "\"",
|
---|
152 | "\\\\\'", "\'",
|
---|
153 | """, "\"",
|
---|
154 | "'", "\'",
|
---|
155 | "[", "\\[",
|
---|
156 | "]", "\\]"
|
---|
157 | };
|
---|
158 | // removed "\\\\n", "\n", "\\|", "\\\\"
|
---|
159 |
|
---|
160 | TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
|
---|
161 | greenstone_to_text = null;
|
---|
162 |
|
---|
163 | // Transform plain html text into something that can be placed in a DOM
|
---|
164 | String[] text_to_dom = {
|
---|
165 | "&", "&",
|
---|
166 | "<", "<",
|
---|
167 | ">", ">",
|
---|
168 | "\"", """,
|
---|
169 | "\'", "'"
|
---|
170 | };
|
---|
171 | TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
|
---|
172 | text_to_dom = null;
|
---|
173 |
|
---|
174 | // Transform plain html text into greenstone encoding
|
---|
175 | String[] text_to_greenstone = {
|
---|
176 |
|
---|
177 | "\\[", "[",
|
---|
178 | "\\]", "]",
|
---|
179 | "\"", """,
|
---|
180 | "\n", "\\\\n"
|
---|
181 | };
|
---|
182 | // "\'", "'",
|
---|
183 | // removed "\\\\", "\\|",
|
---|
184 | TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
|
---|
185 | text_to_greenstone = null;
|
---|
186 |
|
---|
187 | // Transform plain html text into something that can be placed in a shell command
|
---|
188 | String[] text_to_shell_unix = {
|
---|
189 | "\"", "\\\\\"",
|
---|
190 | "\'", "\\\\\'",
|
---|
191 | "\n", "\\\\n"
|
---|
192 | };
|
---|
193 | TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
|
---|
194 | text_to_shell_unix = null;
|
---|
195 |
|
---|
196 | // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
|
---|
197 | String[] text_to_shell_windows = {
|
---|
198 | "\"", "\\\\\\\\\\\\\"",
|
---|
199 | "\'", "\\\\\'",
|
---|
200 | "\n", "\\\\n"
|
---|
201 | };
|
---|
202 | TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
|
---|
203 | text_to_shell_windows = null;
|
---|
204 |
|
---|
205 | CACHE = new HashMap3D();
|
---|
206 | }
|
---|
207 |
|
---|
208 | static public String transform(String raw, String transform) {
|
---|
209 | if(raw == null) {
|
---|
210 | return raw;
|
---|
211 | }
|
---|
212 | // System.err.println("Transforming by "+transform+":\n" + raw);
|
---|
213 | String processed = (String) CACHE.get(transform, raw);
|
---|
214 | if(processed == null) {
|
---|
215 | processed = raw;
|
---|
216 | String[] transforms = (String[]) TRANSFORMS.get(transform);
|
---|
217 | if(transforms != null) {
|
---|
218 | for(int i = 0; i < transforms.length; i = i + 2) {
|
---|
219 | String target = transforms[i];
|
---|
220 | String result = transforms[i+1];
|
---|
221 | processed = processed.replaceAll(target, result);
|
---|
222 | }
|
---|
223 | }
|
---|
224 | Gatherer.println("\n*** Transform: " + transform + " ***");
|
---|
225 | Gatherer.println("*** Raw : '" + raw + "'");
|
---|
226 | Gatherer.println("*** Processed: '" + processed + "'");
|
---|
227 | // If cache is at maximum size, empty it and start again
|
---|
228 | if(CACHE.size() == MAX_CACHE_SIZE) {
|
---|
229 | CACHE.clear();
|
---|
230 | }
|
---|
231 | CACHE.put(transform, raw, processed);
|
---|
232 | }
|
---|
233 | return processed;
|
---|
234 | }
|
---|
235 |
|
---|
236 | /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
|
---|
237 | static final private char AND_CHAR = '&';
|
---|
238 | static final private char ESCAPE_CHAR = '\\';
|
---|
239 | static final private char HASH_CHAR = '#';
|
---|
240 | static final private char LOWER_U_CHAR = 'u';
|
---|
241 | static final private char UPPER_U_CHAR = 'U';
|
---|
242 | static final private char SEMICOLON_CHAR = ';';
|
---|
243 |
|
---|
244 | static public String transformUnicode(String raw) {
|
---|
245 | StringBuffer processed = new StringBuffer();
|
---|
246 | int index = 0;
|
---|
247 | int raw_length = raw.length();
|
---|
248 | while(index < raw_length) {
|
---|
249 | char c0 = raw.charAt(index);
|
---|
250 | switch(c0) {
|
---|
251 | case AND_CHAR:
|
---|
252 | if(index + 1 < raw_length) {
|
---|
253 | // First the HTML ç type
|
---|
254 | char c1 = raw.charAt(index + 1);
|
---|
255 | if(c1 == HASH_CHAR) {
|
---|
256 | StringBuffer number_str = new StringBuffer();
|
---|
257 | char c2;
|
---|
258 | int offset = 2;
|
---|
259 | while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
|
---|
260 | number_str.append(c2);
|
---|
261 | offset++;
|
---|
262 | }
|
---|
263 | // We've either run out of characters or have parsed a number
|
---|
264 | if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
|
---|
265 | int number = Integer.parseInt(number_str.toString());
|
---|
266 | processed.append((char)number);
|
---|
267 | index = index + offset;
|
---|
268 | number_str = null;
|
---|
269 | break;
|
---|
270 | }
|
---|
271 | number_str = null;
|
---|
272 | }
|
---|
273 | }
|
---|
274 | processed.append(c0);
|
---|
275 | break;
|
---|
276 | case ESCAPE_CHAR:
|
---|
277 | // Now the \u00e7 type
|
---|
278 | if(index + 1 < raw_length) {
|
---|
279 | char c3 = raw.charAt(index + 1);
|
---|
280 | if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
|
---|
281 | // We read four digits
|
---|
282 | String hex_str = raw.substring(index + 2, index + 6);
|
---|
283 | int number = Integer.parseInt(hex_str, 16);
|
---|
284 | hex_str = null;
|
---|
285 | processed.append((char)number);
|
---|
286 | index = index + 5;
|
---|
287 | break;
|
---|
288 | }
|
---|
289 | }
|
---|
290 | processed.append(c0);
|
---|
291 | break;
|
---|
292 | default:
|
---|
293 | processed.append(c0);
|
---|
294 | }
|
---|
295 | index++;
|
---|
296 | }
|
---|
297 | return processed.toString();
|
---|
298 | }
|
---|
299 |
|
---|
300 | static public void main(String[] args) {
|
---|
301 | if(args.length < 2) {
|
---|
302 | String processed;
|
---|
303 | String raw;
|
---|
304 | String transform;
|
---|
305 |
|
---|
306 | System.err.println("Running Test Suite");
|
---|
307 |
|
---|
308 | transform = "DOM_TO_GREENSTONE";
|
---|
309 | System.err.println("Test " + transform);
|
---|
310 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
311 | System.err.println("Raw: '" + raw + "'");
|
---|
312 | processed = transform(raw, transform);
|
---|
313 | System.err.println("Processed: '" + processed + "'");
|
---|
314 |
|
---|
315 | transform = "DOM_TO_TEXT";
|
---|
316 | System.err.println("Test " + transform);
|
---|
317 | raw = "A &lt;\nand a <a href="here.html"><font size='2'>URL</font></a>";
|
---|
318 | System.err.println("Raw: '" + raw + "'");
|
---|
319 | processed = transform(raw, transform);
|
---|
320 | System.err.println("Processed: '" + processed + "'");
|
---|
321 |
|
---|
322 | transform = "GREENSTONE_TO_DOM";
|
---|
323 | System.err.println("Test " + transform);
|
---|
324 | raw = "A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
|
---|
325 | System.err.println("Raw: '" + raw + "'");
|
---|
326 | processed = transform(raw, transform);
|
---|
327 | System.err.println("Processed: '" + processed + "'");
|
---|
328 |
|
---|
329 | transform = "GREENSTONE_TO_TEXT";
|
---|
330 | System.err.println("Test " + transform);
|
---|
331 | raw = "These \\[ \\] should be escaped, and so should \\\\ that. These " ' \\n are encoded.";
|
---|
332 | System.err.println("Raw: '" + raw + "'");
|
---|
333 | processed = transform(raw, transform);
|
---|
334 | System.err.println("Processed: '" + processed + "'");
|
---|
335 |
|
---|
336 | transform = "TEXT_TO_DOM";
|
---|
337 | System.err.println("Test " + transform);
|
---|
338 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
339 | System.err.println("Raw: '" + raw + "'");
|
---|
340 | processed = transform(raw, transform);
|
---|
341 | System.err.println("Processed: '" + processed + "'");
|
---|
342 |
|
---|
343 | transform = "TEXT_TO_GREENSTONE";
|
---|
344 | System.err.println("Test " + transform);
|
---|
345 | raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
|
---|
346 | System.err.println("Raw: '" + raw + "'");
|
---|
347 | processed = transform(raw, transform);
|
---|
348 | System.err.println("Processed: '" + processed + "'");
|
---|
349 |
|
---|
350 | transform = "TEXT_TO_SHELL";
|
---|
351 | System.err.println("Test " + transform);
|
---|
352 | if(Utility.isWindows()) {
|
---|
353 | System.err.println("[Windows Version]");
|
---|
354 | transform = "TEXT_TO_SHELL_WINDOWS";
|
---|
355 | }
|
---|
356 | else {
|
---|
357 | System.err.println("[Unix Version]");
|
---|
358 | transform = "TEXT_TO_SHELL_UNIX";
|
---|
359 | }
|
---|
360 | raw = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
|
---|
361 | System.err.println("Raw: '" + raw + "'");
|
---|
362 | processed = transform(raw, transform);
|
---|
363 | System.err.println("Processed: '" + processed + "'");
|
---|
364 |
|
---|
365 | System.err.println("***** UNICODE TEST *****");
|
---|
366 | System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
|
---|
367 | System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
|
---|
368 | System.err.println("\\u007a => " + transformUnicode("\\u007a"));
|
---|
369 | System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
|
---|
370 | System.err.println("0 => " + transformUnicode("0"));
|
---|
371 | System.err.println("A => " + transformUnicode("A"));
|
---|
372 | System.err.println("z => " + transformUnicode("z"));
|
---|
373 | System.err.println("ç => " + transformUnicode("ç"));
|
---|
374 | }
|
---|
375 | else {
|
---|
376 | System.err.println("Raw: '" + args[0] + "'");
|
---|
377 | System.err.println("Transform: " + args[1]);
|
---|
378 | String processed = transform(args[0], args[1]);
|
---|
379 | System.err.println("Processed: '" + processed + "'");
|
---|
380 | }
|
---|
381 | }
|
---|
382 | }
|
---|