source: other-projects/FileTransfer-WebSocketPair/testGXTWithGreenstone/src/org/greenstone/gatherer/util/Codec.java@ 33053

Last change on this file since 33053 was 33053, checked in by ak19, 5 years ago

I still had some stuff of Nathan Kelly's (FileTransfer-WebSocketPair) sitting on my USB. Had already committed the Themes folder at the time, 2 years back. Not sure if he wanted this additional folder committed. But I didn't want to delete it and decided it will be better off on SVN. When we use his project, if we find we didn't need this test folder, we can remove it from svn then.

File size: 13.0 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28
29import java.util.*;
30
/**
 * Provides a standard, extensible way to convert from one format of string to another
 * (given that each format has differing requirements regarding legal characters and
 * escaped characters).
 *
 * Every named transform is a table of (regular expression, replacement) pairs. The pairs
 * are applied in table order with String.replaceAll, so each target is a regex and each
 * replacement follows regex replacement rules (a backslash escapes the next character).
 * Because the pairs run sequentially, their order within a table matters.
 *
 * The result cache is a plain HashMap with no synchronization.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    public static final String DECODE_PATH = "DECODE_PATH";
    public static final String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    public static final String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    public static final String DOM_TO_TEXT = "DOM_TO_TEXT";
    public static final String ENCODE_PATH = "ENCODE_PATH";
    public static final String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    public static final String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
    public static final String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    public static final String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    public static final String TEXT_TO_DOM = "TEXT_TO_DOM";
    public static final String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    public static final String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    public static final String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    public static final String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** Number of cached results at which the cache is emptied and restarted. */
    private static final int MAX_CACHE_SIZE = 100;

    /** Transform name -> flat array of regex/replacement pairs (even index = regex, odd index = replacement). */
    private static final Map<String, String[]> TRANSFORMS = new HashMap<String, String[]>();

    /** Transform name -> (raw string -> processed string). */
    private static final Map<String, Map<String, String>> CACHE = new HashMap<String, Map<String, String>>();

    /** Total number of raw strings currently held in CACHE, across all transforms. */
    private static int cachedEntryCount = 0;

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /** Static function called to construct TRANSFORMS mappings */
    static {
        // Inverse of ENCODE_PATH: a pipe becomes a backslash, then the pipe entity becomes a pipe.
        // The second target must be the entity; a bare "|" is an empty alternation that matches
        // at every position.
        TRANSFORMS.put(DECODE_PATH, new String[] {
            "\\|", "\\\\",
            "&#124;", "\\|"
        });

        // Inverse of ENCODE_SQUARE_BRACKETS: turn the bracket entities back into [ and ].
        // The targets must be the entities; a bare "[" is not even a valid regular expression.
        TRANSFORMS.put(DECODE_SQUARE_BRACKETS, new String[] {
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Translate DOM encoded text into Greenstone encoding.
        // "&amp;" is handled last so that freshly produced ampersands are not decoded twice.
        // (Newlines are not translated - config files are allowed new lines.)
        TRANSFORMS.put(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text
        TRANSFORMS.put(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Transform text into a regular expression that will match it.
        // NOTE(review): only backslash, (), [], {} and . are escaped; other metacharacters
        // such as * + ? ^ $ | pass through unchanged - confirm whether callers rely on that.
        TRANSFORMS.put(TEXT_TO_REGEXP, new String[] {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\."
        });

        // A pipe becomes its entity first, then every backslash becomes a pipe.
        TRANSFORMS.put(ENCODE_PATH, new String[] {
            "\\|", "&#124;",
            "\\\\", "\\|"
        });

        // Transform text into text, but without [ and ]
        TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;"
        });

        // Transform Greenstone encoded text to DOM encoding.
        // Backslash-escaped quotes are matched before bare quotes so the backslash is consumed.
        TRANSFORMS.put(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text
        TRANSFORMS.put(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform plain html text into something that can be placed in a DOM.
        // The ampersand goes first so the entities produced afterwards are not re-escaped.
        TRANSFORMS.put(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Unescape html (or xml) text. This table has no entry for &apos;, so it is left as is.
        TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, new String[] {
            "&amp;", "&",
            "&lt;", "<",
            "&gt;", ">",
            "&quot;", "\""
        });

        // Transform plain html text into greenstone encoding
        TRANSFORMS.put(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command.
        // Windows requires twice as many escapes for speech marks to be passed to underlying processes
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /**
     * Applies the named transform to the given string.
     *
     * Results are cached per (transform, raw) pair. Once MAX_CACHE_SIZE results are held,
     * the whole cache is emptied before the next result is stored.
     *
     * @param raw the text to convert; may be null
     * @param transform the name of the transform to apply (one of the public constants of this class)
     * @return the converted text; null if raw is null; raw unchanged if the transform name is unknown
     */
    public static String transform(String raw, String transform) {
        if (raw == null) {
            return raw;
        }

        Map<String, String> cachedForTransform = CACHE.get(transform);
        if (cachedForTransform != null) {
            String cached = cachedForTransform.get(raw);
            if (cached != null) {
                return cached;
            }
        }

        String processed = raw;
        String[] pairs = TRANSFORMS.get(transform);
        if (pairs != null) {
            for (int i = 0; i < pairs.length; i += 2) {
                processed = processed.replaceAll(pairs[i], pairs[i + 1]);
            }
        }

        remember(transform, raw, processed);
        return processed;
    }

    /** Stores one freshly computed result, emptying the cache first if it has reached MAX_CACHE_SIZE entries. */
    private static void remember(String transform, String raw, String processed) {
        if (cachedEntryCount >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cachedEntryCount = 0;
        }
        Map<String, String> cachedForTransform = CACHE.get(transform);
        if (cachedForTransform == null) {
            cachedForTransform = new HashMap<String, String>();
            CACHE.put(transform, cachedForTransform);
        }
        // Only called after a cache miss, so this always adds a new entry.
        cachedForTransform.put(raw, processed);
        cachedEntryCount++;
    }

    /**
     * Transform either of the accepted unicode escape sequence styles found in the string
     * into single characters.
     *
     * Two styles are recognised: HTML numeric character references (decimal such as
     * &amp;#231; or hexadecimal such as &amp;#xE7;) and Java-style escapes made of a
     * backslash, a 'u' or 'U', and exactly four hexadecimal digits. Anything that looks
     * like the start of an escape but is malformed (missing semicolon, non-digit
     * characters, value beyond the Unicode range) is copied to the output unchanged.
     *
     * @param raw the text to process; may be null
     * @return the text with every well-formed escape replaced, or null if raw is null
     */
    public static String transformUnicode(String raw) {
        if (raw == null) {
            return null;
        }
        int length = raw.length();
        StringBuilder processed = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            char current = raw.charAt(index);
            int consumed = 0;
            if (current == AND_CHAR) {
                consumed = appendNumericReference(raw, index, processed);
            }
            else if (current == ESCAPE_CHAR) {
                consumed = appendUnicodeEscape(raw, index, processed);
            }
            if (consumed == 0) {
                // Not an escape (or a malformed one): keep the character as is.
                processed.append(current);
                consumed = 1;
            }
            index += consumed;
        }
        return processed.toString();
    }

    /**
     * Tries to decode an HTML numeric character reference starting at the '&' at position start.
     *
     * @return the number of characters consumed, or 0 if no well-formed reference starts there
     */
    private static int appendNumericReference(String raw, int start, StringBuilder out) {
        int length = raw.length();
        if (start + 1 >= length || raw.charAt(start + 1) != HASH_CHAR) {
            return 0;
        }
        int digitsStart = start + 2;
        int radix = 10;
        if (digitsStart < length && (raw.charAt(digitsStart) == 'x' || raw.charAt(digitsStart) == 'X')) {
            radix = 16;
            digitsStart++;
        }
        int end = raw.indexOf(SEMICOLON_CHAR, digitsStart);
        // end is -1 when there is no terminating semicolon, and equals digitsStart when there are no digits.
        if (end <= digitsStart) {
            return 0;
        }
        int codePoint = parseCodePoint(raw, digitsStart, end, radix);
        if (codePoint < 0) {
            return 0;
        }
        // appendCodePoint emits a surrogate pair for values above U+FFFF instead of truncating them.
        out.appendCodePoint(codePoint);
        return end - start + 1;
    }

    /**
     * Tries to decode a backslash-u escape with four hex digits starting at the backslash at position start.
     *
     * @return 6 if an escape was decoded and appended, or 0 if no well-formed escape starts there
     */
    private static int appendUnicodeEscape(String raw, int start, StringBuilder out) {
        // Six characters are needed in total: backslash, u, and four hex digits.
        if (start + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(start + 1);
        if (marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        int codeUnit = parseCodePoint(raw, start + 2, start + 6, 16);
        if (codeUnit < 0) {
            return 0;
        }
        out.append((char) codeUnit);
        return 6;
    }

    /**
     * Parses the digits in raw[from, to) in the given radix.
     *
     * @return the parsed value, or -1 if a character is not a digit in that radix or the value
     *         exceeds the largest Unicode code point
     */
    private static int parseCodePoint(String raw, int from, int to, int radix) {
        int value = 0;
        for (int i = from; i < to; i++) {
            int digit = Character.digit(raw.charAt(i), radix);
            if (digit < 0) {
                return -1;
            }
            value = value * radix + digit;
            // Checking after every digit also keeps the accumulator from overflowing.
            if (value > Character.MAX_CODE_POINT) {
                return -1;
            }
        }
        return value;
    }

    /** Prints the raw text and the result of running it through the named transform to standard error. */
    private static void report(String raw, String transformName) {
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transformName) + "'");
    }

    /**
     * Command line entry point.
     *
     * With fewer than two arguments a built-in test suite is run and its results are printed
     * to standard error; both the Unix and the Windows shell tables are exercised, whatever
     * the host platform. Otherwise args[0] is transformed using the transform named by args[1].
     *
     * @param args optionally the raw text followed by the transform name
     */
    public static void main(String[] args) {
        if (args.length >= 2) {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
            return;
        }

        System.err.println("Running Test Suite");

        String domSample = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
        String htmlSample = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

        System.err.println("Test " + DOM_TO_GREENSTONE);
        report(domSample, DOM_TO_GREENSTONE);

        System.err.println("Test " + DOM_TO_TEXT);
        report(domSample, DOM_TO_TEXT);

        System.err.println("Test " + GREENSTONE_TO_DOM);
        report("A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>", GREENSTONE_TO_DOM);

        System.err.println("Test " + GREENSTONE_TO_TEXT);
        report("These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.", GREENSTONE_TO_TEXT);

        System.err.println("Test " + TEXT_TO_DOM);
        report(htmlSample, TEXT_TO_DOM);

        System.err.println("Test " + TEXT_TO_GREENSTONE);
        report("These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.", TEXT_TO_GREENSTONE);

        System.err.println("Test " + "TEXT_TO_SHELL");
        System.err.println("[Unix Version]");
        report(htmlSample, TEXT_TO_SHELL_UNIX);
        System.err.println("[Windows Version]");
        report(htmlSample, TEXT_TO_SHELL_WINDOWS);

        System.err.println("***** UNICODE TEST *****");
        System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
        System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
        System.err.println("\\u007a => " + transformUnicode("\\u007a"));
        System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
        System.err.println("&#48; => " + transformUnicode("&#48;"));
        System.err.println("&#65; => " + transformUnicode("&#65;"));
        System.err.println("&#122; => " + transformUnicode("&#122;"));
        System.err.println("&#231; => " + transformUnicode("&#231;"));
    }
}
Note: See TracBrowser for help on using the repository browser.