source: main/trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 34246

Last change on this file since 34246 was 34246, checked in by ak19, 4 years ago

Still part of commit 34241 and now also 34245.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28
29import java.util.*;
30
31/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
32 * @author John Thompson, Greenstone Digital Library, University of Waikato
33 * @version 2.3d
34 */
35public class Codec {
36
37 static final public String DECODE_PATH = "DECODE_PATH";
38 static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
39 static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
40 static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
41 static final public String ENCODE_PATH = "ENCODE_PATH";
42 static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
43 static final public String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
44 static final public String REINSTATE_HTML_TAGS = "REINSTATE_HTML_TAGS";
45 static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
46 static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
47 static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
48 static final public String TEXT_TO_DOM_PRESERVE_TAGS = "TEXT_TO_DOM_PRESERVE_TAGS";
49 static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
50 static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
51 static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
52 static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
53
54 static final private int MAX_CACHE_SIZE = 100;
55
56 static private HashMap TRANSFORMS;
57 static private HashMap3D CACHE;
58
59 /** Static function called to construct TRANSFORMS mappings */
60 static {
61 TRANSFORMS = new HashMap();
62
63 String[] decode_path = {
64 "\\|", "\\\\",
65 "|", "\\|"
66 };
67 TRANSFORMS.put(DECODE_PATH, decode_path);
68 decode_path = null;
69
70 // Transform text into text, but without [ and ]
71 String[] decode_square_brackets = {
72 "[", "\\[",
73 "]", "\\]"
74 };
75 TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
76 decode_square_brackets = null;
77
78 // Translate DOM encoded text into Greenstone encoding
79 String[] dom_to_greenstone = {
80 "'", "\\\\\'",
81 ">", ">",
82 "&lt;", "<",
83 "&quot;", "\\\\\"",
84 "&amp;", "&"
85 };
86 // removed "\n", "\\\\n", - config files are allowed new lines
87 // added "\\|", "\\\\"
88
89 TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
90 dom_to_greenstone = null;
91
92 // Transform DOM encoded text into plain text
93 String[] dom_to_text = {
94 "&amp;#091;", "\\[",
95 "&amp;#093;", "\\]",
96 "&apos;", "\'",
97 "&gt;", ">",
98 "&lt;", "<",
99 "&quot;", "\"",
100 "&amp;", "&"
101 };
102 TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
103 dom_to_text = null;
104
105 // Transform text into a regular expression that will match it
106 String[] text_to_regexp = {
107 "\\\\", "\\\\\\\\",
108 "\\(", "\\\\(",
109 "\\)", "\\\\)",
110 "\\[", "\\\\[",
111 "\\]", "\\\\]",
112 "\\{", "\\\\{",
113 "\\}", "\\\\}",
114 "\\.", "\\\\."
115 };
116 TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
117 text_to_regexp = null;
118
119 String[] encode_path = {
120 "\\|", "&#124;",
121 "\\\\", "\\|"
122 };
123 TRANSFORMS.put(ENCODE_PATH, encode_path);
124 encode_path = null;
125
126 // Transform text into text, but without [ and ]
127 String[] encode_square_brackets = {
128 "\\[", "&#091;",
129 "\\]", "&#093;"
130 };
131 TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
132 encode_square_brackets = null;
133
134 // Transform Greenstone encoded text to DOM encoding
135 String[] greenstone_to_dom = {
136 "&", "&amp;",
137 "<", "&lt;",
138 ">", "&gt;",
139 "\\\\\"", "&quot;",
140 "\\\\\'", "&apos;",
141 "\"", "&quot;",
142 "\'", "&apos;"
143 };
144 // removed"\\\\n", "\n", added "\\\\", "\\|"
145
146 TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
147 greenstone_to_dom = null;
148
149 // Transform Greenstone encoded text to plain text
150 String[] greenstone_to_text = {
151 "\\\\\"", "\"",
152 "\\\\\'", "\'",
153 "&quot;", "\"",
154 "&apos;", "\'",
155 "&#091;", "\\[",
156 "&#093;", "\\]"
157 };
158 // removed "\\\\n", "\n", "\\|", "\\\\"
159
160 TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
161 greenstone_to_text = null;
162
163 // Transform plain html text into something that can be placed in a DOM
164 String[] text_to_dom = {
165 "&", "&amp;",
166 "<", "&lt;",
167 ">", "&gt;",
168 "\"", "&quot;",
169 "\'", "&apos;"
170 };
171 TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
172 text_to_dom = null;
173
174 // Same as above, but preserve html element tags
175 String[] text_to_dom_preserve_tags = {
176 "&", "&amp;",
177 "\"", "&quot;",
178 //"\'", "&apos;"
179 };
180 TRANSFORMS.put(TEXT_TO_DOM_PRESERVE_TAGS, text_to_dom_preserve_tags);
181 text_to_dom_preserve_tags = null;
182
183 // Unescape html (or xml) text
184 String[] escapedhtml_to_unescaped = {
185 "&amp;", "&",
186 "&lt;", "<",
187 "&gt;", ">",
188 "&quot;", "\""//,
189 //"&apos;", "\'"
190 };
191 TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, escapedhtml_to_unescaped);
192 escapedhtml_to_unescaped = null;
193
194 // Reinstate tag markers <>
195 String[] reinstate_html_tags = {
196 "&lt;", "<",
197 "&gt;", ">",
198 };
199 TRANSFORMS.put(REINSTATE_HTML_TAGS, reinstate_html_tags);
200 reinstate_html_tags = null;
201
202
203 // Transform plain html text into greenstone encoding
204 String[] text_to_greenstone = {
205
206 "\\[", "&#091;",
207 "\\]", "&#093;",
208 "\"", "&quot;",
209 "\n", "\\\\n"
210 };
211 // "\'", "&apos;",
212 // removed "\\\\", "\\|",
213 TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
214 text_to_greenstone = null;
215
216 // Transform plain html text into something that can be placed in a shell command
217 String[] text_to_shell_unix = {
218 "\"", "\\\\\"",
219 "\'", "\\\\\'",
220 "\n", "\\\\n"
221 };
222 TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
223 text_to_shell_unix = null;
224
225 // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
226 String[] text_to_shell_windows = {
227 "\"", "\\\\\\\\\\\\\"",
228 "\'", "\\\\\'",
229 "\n", "\\\\n"
230 };
231 TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
232 text_to_shell_windows = null;
233
234 CACHE = new HashMap3D();
235 }
236
237 static public String transform(String raw, String transform) {
238 if(raw == null) {
239 return raw;
240 }
241 // System.err.println("Transforming by "+transform+":\n" + raw);
242 String processed = (String) CACHE.get(transform, raw);
243 if(processed == null) {
244 processed = raw;
245 String[] transforms = (String[]) TRANSFORMS.get(transform);
246 if(transforms != null) {
247 for(int i = 0; i < transforms.length; i = i + 2) {
248 String target = transforms[i];
249 String result = transforms[i+1];
250 processed = processed.replaceAll(target, result);
251 }
252 }
253 //DebugStream.println("\n*** Transform: " + transform + " ***");
254 //DebugStream.println("*** Raw : '" + raw + "'");
255 //DebugStream.println("*** Processed: '" + processed + "'");
256 // If cache is at maximum size, empty it and start again
257 if(CACHE.size() == MAX_CACHE_SIZE) {
258 CACHE.clear();
259 }
260 CACHE.put(transform, raw, processed);
261 }
262 return processed;
263 }
264
265 /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
266 static final private char AND_CHAR = '&';
267 static final private char ESCAPE_CHAR = '\\';
268 static final private char HASH_CHAR = '#';
269 static final private char LOWER_U_CHAR = 'u';
270 static final private char UPPER_U_CHAR = 'U';
271 static final private char SEMICOLON_CHAR = ';';
272
273 static public String transformUnicode(String raw) {
274 StringBuffer processed = new StringBuffer();
275 int index = 0;
276 int raw_length = raw.length();
277 while(index < raw_length) {
278 char c0 = raw.charAt(index);
279 switch(c0) {
280 case AND_CHAR:
281 if(index + 1 < raw_length) {
282 // First the HTML &#231; type
283 char c1 = raw.charAt(index + 1);
284 if(c1 == HASH_CHAR) {
285 StringBuffer number_str = new StringBuffer();
286 char c2;
287 int offset = 2;
288 while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
289 number_str.append(c2);
290 offset++;
291 }
292 // We've either run out of characters or have parsed a number
293 if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
294 int number = Integer.parseInt(number_str.toString());
295 processed.append((char)number);
296 index = index + offset;
297 number_str = null;
298 break;
299 }
300 number_str = null;
301 }
302 }
303 processed.append(c0);
304 break;
305 case ESCAPE_CHAR:
306 // Now the \u00e7 type
307 if(index + 1 < raw_length) {
308 char c3 = raw.charAt(index + 1);
309 if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
310 // We read four digits
311 String hex_str = raw.substring(index + 2, index + 6);
312 int number = Integer.parseInt(hex_str, 16);
313 hex_str = null;
314 processed.append((char)number);
315 index = index + 5;
316 break;
317 }
318 }
319 processed.append(c0);
320 break;
321 default:
322 processed.append(c0);
323 }
324 index++;
325 }
326 return processed.toString();
327 }
328
329 static public void main(String[] args) {
330 if(args.length < 2) {
331 String processed;
332 String raw;
333 String transform;
334
335 System.err.println("Running Test Suite");
336
337 transform = "DOM_TO_GREENSTONE";
338 System.err.println("Test " + transform);
339 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
340 System.err.println("Raw: '" + raw + "'");
341 processed = transform(raw, transform);
342 System.err.println("Processed: '" + processed + "'");
343
344 transform = "DOM_TO_TEXT";
345 System.err.println("Test " + transform);
346 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
347 System.err.println("Raw: '" + raw + "'");
348 processed = transform(raw, transform);
349 System.err.println("Processed: '" + processed + "'");
350
351 transform = "GREENSTONE_TO_DOM";
352 System.err.println("Test " + transform);
353 raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
354 System.err.println("Raw: '" + raw + "'");
355 processed = transform(raw, transform);
356 System.err.println("Processed: '" + processed + "'");
357
358 transform = "GREENSTONE_TO_TEXT";
359 System.err.println("Test " + transform);
360 raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
361 System.err.println("Raw: '" + raw + "'");
362 processed = transform(raw, transform);
363 System.err.println("Processed: '" + processed + "'");
364
365 transform = "TEXT_TO_DOM";
366 System.err.println("Test " + transform);
367 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
368 System.err.println("Raw: '" + raw + "'");
369 processed = transform(raw, transform);
370 System.err.println("Processed: '" + processed + "'");
371
372 transform = "TEXT_TO_GREENSTONE";
373 System.err.println("Test " + transform);
374 raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
375 System.err.println("Raw: '" + raw + "'");
376 processed = transform(raw, transform);
377 System.err.println("Processed: '" + processed + "'");
378
379 transform = "TEXT_TO_SHELL";
380 System.err.println("Test " + transform);
381 if(Utility.isWindows()) {
382 System.err.println("[Windows Version]");
383 transform = "TEXT_TO_SHELL_WINDOWS";
384 }
385 else {
386 System.err.println("[Unix Version]");
387 transform = "TEXT_TO_SHELL_UNIX";
388 }
389 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
390 System.err.println("Raw: '" + raw + "'");
391 processed = transform(raw, transform);
392 System.err.println("Processed: '" + processed + "'");
393
394 System.err.println("***** UNICODE TEST *****");
395 System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
396 System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
397 System.err.println("\\u007a => " + transformUnicode("\\u007a"));
398 System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
399 System.err.println("&#48; => " + transformUnicode("&#48;"));
400 System.err.println("&#65; => " + transformUnicode("&#65;"));
401 System.err.println("&#122; => " + transformUnicode("&#122;"));
402 System.err.println("&#231; => " + transformUnicode("&#231;"));
403 }
404 else {
405 System.err.println("Raw: '" + args[0] + "'");
406 System.err.println("Transform: " + args[1]);
407 String processed = transform(args[0], args[1]);
408 System.err.println("Processed: '" + processed + "'");
409 }
410 }
411}
Note: See TracBrowser for help on using the repository browser.