source: main/trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 34241

Last change on this file since 34241 was 34241, checked in by ak19, 4 years ago

The previous commit fixed the issue where HTML in collection descriptions was not being preserved when GLI was not involved. This commit fixes the remaining problems with preserving HTML in coll descriptions when GLI is involved.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28
29import java.util.*;
30
/**
 * Provides a standard, extensible way to convert a string from one format to
 * another, given that each format has differing requirements regarding legal
 * characters and escaped characters.
 *
 * <p>Each named transform is an ordered list of (target, replacement) pairs.
 * The pairs are applied one after another with {@link String#replaceAll}, so
 * every target is a regular expression and every replacement uses the
 * regular-expression replacement syntax (a backslash escapes the next
 * character). The order of the pairs inside a table therefore matters.
 *
 * <p>This class does no locking of its own around the shared result cache.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    public static final String DECODE_PATH = "DECODE_PATH";
    public static final String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    public static final String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    public static final String DOM_TO_TEXT = "DOM_TO_TEXT";
    public static final String ENCODE_PATH = "ENCODE_PATH";
    public static final String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    public static final String ESCAPEDHTML_TO_UNESCAPED = "ESCAPEDHTML_TO_UNESCAPED";
    public static final String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    public static final String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    public static final String TEXT_TO_DOM = "TEXT_TO_DOM";
    public static final String TEXT_TO_DOM_PRESERVE_TAGS = "TEXT_TO_DOM_PRESERVE_TAGS";
    public static final String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    public static final String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    public static final String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    public static final String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** Number of cached results after which the cache is emptied and restarted. */
    private static final int MAX_CACHE_SIZE = 100;

    /** Transform name -> flat array of alternating (regex target, replacement) strings. */
    private static final Map<String, String[]> TRANSFORMS = new HashMap<String, String[]>();

    /** Transform name -> (raw string -> processed string). */
    private static final Map<String, Map<String, String>> CACHE =
        new HashMap<String, Map<String, String>>();

    /** Total number of results currently held in CACHE, across all transforms. */
    private static int cachedResults = 0;

    /** Builds the TRANSFORMS mappings. */
    static {
        // Inverse of ENCODE_PATH: '|' becomes a backslash, then the pipe
        // entity becomes a real '|'.
        TRANSFORMS.put(DECODE_PATH, new String[] {
            "\\|", "\\\\",
            "&#124;", "\\|"
        });

        // Inverse of ENCODE_SQUARE_BRACKETS: restore [ and ] from their entities.
        TRANSFORMS.put(DECODE_SQUARE_BRACKETS, new String[] {
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Translate DOM encoded text into Greenstone encoding. Quotes come
        // out backslash-escaped. Newlines are deliberately left alone because
        // config files are allowed to contain them. "&amp;" goes last so that
        // an escaped entity is only unescaped one level.
        TRANSFORMS.put(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text.
        TRANSFORMS.put(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Transform text into a regular expression that will match it.
        // NOTE(review): only backslash, ( ) [ ] { } and . are escaped; other
        // metacharacters such as * + ? ^ $ | pass through unchanged. Confirm
        // whether callers rely on that before extending the table.
        TRANSFORMS.put(TEXT_TO_REGEXP, new String[] {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\."
        });

        // '|' becomes its entity first, then every backslash becomes '|'.
        TRANSFORMS.put(ENCODE_PATH, new String[] {
            "\\|", "&#124;",
            "\\\\", "\\|"
        });

        // Transform text into text, but without [ and ].
        TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;"
        });

        // Transform Greenstone encoded text to DOM encoding. Backslash-escaped
        // quotes are handled before bare quotes so the backslash is consumed.
        TRANSFORMS.put(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text.
        TRANSFORMS.put(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform plain html text into something that can be placed in a DOM.
        TRANSFORMS.put(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Same as above, but preserve html element tags (< and > untouched).
        TRANSFORMS.put(TEXT_TO_DOM_PRESERVE_TAGS, new String[] {
            "&", "&amp;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Unescape html (or xml) text. "&apos;" is intentionally not handled.
        // NOTE(review): "&amp;" is replaced first here, so "&amp;lt;" ends up
        // as "<" (two levels of unescaping) - confirm this is wanted.
        TRANSFORMS.put(ESCAPEDHTML_TO_UNESCAPED, new String[] {
            "&amp;", "&",
            "&lt;", "<",
            "&gt;", ">",
            "&quot;", "\""
        });

        // Transform plain html text into greenstone encoding. Apostrophes and
        // backslashes are intentionally left alone.
        TRANSFORMS.put(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a
        // shell command.
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // As above for Windows, which requires more escaping for speech marks
        // to be passed to underlying processes.
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /**
     * Applies the named transform to a string.
     *
     * @param raw the string to convert; may be null
     * @param transform the name of the transform, normally one of the
     *        constants of this class
     * @return null if raw is null; raw unchanged if the transform name is not
     *         registered; otherwise raw with every rule of the transform
     *         applied in table order
     */
    public static String transform(String raw, String transform) {
        if (raw == null) {
            return null;
        }
        Map<String, String> cachedForTransform = CACHE.get(transform);
        if (cachedForTransform != null) {
            String cached = cachedForTransform.get(raw);
            if (cached != null) {
                return cached;
            }
        }

        String processed = raw;
        String[] rules = TRANSFORMS.get(transform);
        if (rules != null) {
            // Rules are stored as a flat array: target, replacement, target, ...
            for (int i = 0; i + 1 < rules.length; i += 2) {
                processed = processed.replaceAll(rules[i], rules[i + 1]);
            }
        }
        remember(transform, raw, processed);
        return processed;
    }

    /**
     * Stores a freshly computed result, emptying the whole cache first when
     * it already holds MAX_CACHE_SIZE results.
     */
    private static void remember(String transform, String raw, String processed) {
        if (cachedResults >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cachedResults = 0;
        }
        Map<String, String> cachedForTransform = CACHE.get(transform);
        if (cachedForTransform == null) {
            cachedForTransform = new HashMap<String, String>();
            CACHE.put(transform, cachedForTransform);
        }
        cachedForTransform.put(raw, processed);
        cachedResults++;
    }

    private static final char AND_CHAR = '&';
    private static final char ESCAPE_CHAR = '\\';
    private static final char HASH_CHAR = '#';
    private static final char LOWER_U_CHAR = 'u';
    private static final char UPPER_U_CHAR = 'U';
    private static final char SEMICOLON_CHAR = ';';

    /**
     * Replaces both accepted styles of Unicode escape sequence in a string
     * with the characters they denote: HTML numeric references such as
     * {@code &#231;} (or the hexadecimal form {@code &#xE7;}) and the
     * backslash-u form followed by exactly four hexadecimal digits.
     *
     * <p>Anything that merely looks like an escape but cannot be decoded (no
     * terminating semicolon, non-numeric digits, a value that is not a valid
     * code point) is copied to the output unchanged.
     *
     * @param raw the string to process; may be null
     * @return the processed string, or null if raw is null
     */
    public static String transformUnicode(String raw) {
        if (raw == null) {
            return null;
        }
        int length = raw.length();
        StringBuilder processed = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            char current = raw.charAt(index);
            int consumed = 0;
            if (current == AND_CHAR) {
                consumed = decodeNumericReference(raw, index, processed);
            }
            else if (current == ESCAPE_CHAR) {
                consumed = decodeBackslashEscape(raw, index, processed);
            }
            if (consumed == 0) {
                // Not an escape sequence (or not a decodable one): keep as is.
                processed.append(current);
                consumed = 1;
            }
            index += consumed;
        }
        return processed.toString();
    }

    /**
     * Tries to decode an HTML numeric character reference starting at the
     * ampersand at position start, appending the decoded character to out.
     *
     * @return the number of input characters consumed, or 0 if nothing was
     *         decoded (out is then untouched)
     */
    private static int decodeNumericReference(String raw, int start, StringBuilder out) {
        if (start + 1 >= raw.length() || raw.charAt(start + 1) != HASH_CHAR) {
            return 0;
        }
        int end = raw.indexOf(SEMICOLON_CHAR, start + 2);
        if (end < 0) {
            return 0;
        }
        String digits = raw.substring(start + 2, end);
        int radix = 10;
        if (digits.length() > 1 && (digits.charAt(0) == 'x' || digits.charAt(0) == 'X')) {
            radix = 16;
            digits = digits.substring(1);
        }
        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, radix);
        }
        catch (NumberFormatException notANumber) {
            return 0;
        }
        if (!Character.isValidCodePoint(codePoint)) {
            return 0;
        }
        // appendCodePoint emits a surrogate pair for values above U+FFFF
        // instead of truncating them to a single char.
        out.appendCodePoint(codePoint);
        return end - start + 1;
    }

    /**
     * Tries to decode a backslash-u escape (four hexadecimal digits) starting
     * at the backslash at position start, appending the decoded character to
     * out.
     *
     * @return the number of input characters consumed (always 6 on success),
     *         or 0 if nothing was decoded (out is then untouched)
     */
    private static int decodeBackslashEscape(String raw, int start, StringBuilder out) {
        if (start + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(start + 1);
        if (marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        int value;
        try {
            value = Integer.parseInt(raw.substring(start + 2, start + 6), 16);
        }
        catch (NumberFormatException notHex) {
            return 0;
        }
        if (value < 0) {
            // parseInt accepts a leading minus sign; that is not an escape.
            return 0;
        }
        out.append((char) value);
        return 6;
    }

    /**
     * Command line entry point. With fewer than two arguments a fixed set of
     * sample conversions is printed to standard error; otherwise the first
     * argument is transformed using the transform named by the second
     * argument and the result is printed to standard error.
     */
    public static void main(String[] args) {
        if (args.length >= 2) {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            System.err.println("Processed: '" + transform(args[0], args[1]) + "'");
            return;
        }

        System.err.println("Running Test Suite");

        String domSample = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
        String htmlSample = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

        runTest(DOM_TO_GREENSTONE, domSample);
        runTest(DOM_TO_TEXT, domSample);
        runTest(GREENSTONE_TO_DOM, "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>");
        runTest(GREENSTONE_TO_TEXT, "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.");
        runTest(TEXT_TO_DOM, htmlSample);
        runTest(TEXT_TO_GREENSTONE, "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.");
        // Both shell flavours are exercised regardless of the host platform.
        runTest(TEXT_TO_SHELL_UNIX, htmlSample);
        runTest(TEXT_TO_SHELL_WINDOWS, htmlSample);

        System.err.println("***** UNICODE TEST *****");
        System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
        System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
        System.err.println("\\u007a => " + transformUnicode("\\u007a"));
        System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
        System.err.println("&#48; => " + transformUnicode("&#48;"));
        System.err.println("&#65; => " + transformUnicode("&#65;"));
        System.err.println("&#122; => " + transformUnicode("&#122;"));
        System.err.println("&#231; => " + transformUnicode("&#231;"));
    }

    /** Prints one sample conversion (name, input, output) to standard error. */
    private static void runTest(String transformName, String raw) {
        System.err.println("Test " + transformName);
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transformName) + "'");
    }
}
Note: See TracBrowser for help on using the repository browser.