source: main/trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 22605

Last change on this file since 22605 was 8243, checked in by mdewsnip, 20 years ago

Removed all occurrences of classes explicitly importing other classes in the same package.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.7 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28
29import java.util.*;
30
/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters).
 * Each named transform is an ordered list of (regular expression, replacement) pairs which are applied one after another with String.replaceAll(). Because of that, the first string of a pair must be a valid regular expression, and in the second string a backslash escapes the character that follows it.
 * No synchronization is performed on the shared result cache.
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    static final public String DECODE_PATH = "DECODE_PATH";
    static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
    static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
    static final public String ENCODE_PATH = "ENCODE_PATH";
    static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
    static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
    static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
    static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** The number of cached results at which the cache is emptied and started again. */
    static final private int MAX_CACHE_SIZE = 100;
    /** Placed between the transform name and the raw text to build a cache key. None of the transform names registered below contains it, so keys cannot collide. */
    static final private char CACHE_KEY_SEPARATOR = '\n';
    /** The largest value that is a legal Unicode code point. */
    static final private int MAX_CODE_POINT = 0x10FFFF;
    /** The first code point that needs two chars (a surrogate pair) in a Java string. */
    static final private int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;

    /** Maps a transform name to its String[] of alternating (regular expression, replacement) entries. */
    static final private HashMap TRANSFORMS = new HashMap();
    /** Maps "transform name + separator + raw text" to the previously computed result. */
    static final private HashMap CACHE = new HashMap();

    /** Static block called to construct the TRANSFORMS mappings. The order of the pairs within a transform matters, as each pair is applied to the output of the previous one. */
    static {
        // The inverse of ENCODE_PATH: first turn | back into \, then the entity back into |
        register(DECODE_PATH, new String[] {
            "\\|", "\\\\",
            "&#124;", "\\|"
        });

        // The inverse of ENCODE_SQUARE_BRACKETS: turn the entities back into [ and ]
        register(DECODE_SQUARE_BRACKETS, new String[] {
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Translate DOM encoded text into Greenstone encoding
        // removed "\n", "\\\\n", - config files are allowed new lines
        register(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text. &amp; must be decoded last so that freshly produced ampersands are not decoded a second time
        register(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        // Transform text into a regular expression that will match it
        register(TEXT_TO_REGEXP, new String[] {
            "\\\\", "\\\\\\\\",
            "\\(", "\\\\(",
            "\\)", "\\\\)",
            "\\[", "\\\\[",
            "\\]", "\\\\]",
            "\\{", "\\\\{",
            "\\}", "\\\\}",
            "\\.", "\\\\."
        });

        // Hide any real | behind an entity, then use | in place of \
        register(ENCODE_PATH, new String[] {
            "\\|", "&#124;",
            "\\\\", "\\|"
        });

        // Transform text into text, but without [ and ]
        register(ENCODE_SQUARE_BRACKETS, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;"
        });

        // Transform Greenstone encoded text to DOM encoding. & must be encoded first so the entities produced afterwards are left alone
        // removed "\\\\n", "\n"
        register(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text
        // removed "\\\\n", "\n", "\\|", "\\\\"
        register(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform plain html text into something that can be placed in a DOM
        register(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform plain html text into greenstone encoding
        // "\'", "&apos;",
        // removed "\\\\", "\\|",
        register(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        register(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
        register(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /** Records the replacement pairs belonging to a named transform.
     * @param name the name callers pass to transform() to select this table
     * @param pairs alternating regular expression / replacement strings, in the order they are to be applied
     */
    static private void register(String name, String[] pairs) {
        TRANSFORMS.put(name, pairs);
    }

    /** Applies the named transform to a string, remembering the result so that repeated requests are answered from the cache.
     * @param raw the text to convert, may be null
     * @param transform the name of the transform to apply, normally one of the public constants of this class
     * @return the converted text; null if raw is null; raw itself if the transform name is not known
     */
    static public String transform(String raw, String transform) {
        if(raw == null) {
            return null;
        }
        String[] pairs = (String[]) TRANSFORMS.get(transform);
        if(pairs == null) {
            // An unknown transform has nothing to apply, so there is nothing worth caching either
            return raw;
        }
        String cache_key = transform + CACHE_KEY_SEPARATOR + raw;
        String processed = (String) CACHE.get(cache_key);
        if(processed == null) {
            processed = raw;
            for(int i = 0; i + 1 < pairs.length; i = i + 2) {
                processed = processed.replaceAll(pairs[i], pairs[i + 1]);
            }
            // If cache is at maximum size, empty it and start again
            if(CACHE.size() >= MAX_CACHE_SIZE) {
                CACHE.clear();
            }
            CACHE.put(cache_key, processed);
        }
        return processed;
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /** Transform either of the accepted unicode escape sequence styles found in the string into single characters. The two styles are the HTML decimal character reference (ampersand, hash, decimal digits, semicolon) and the Java style escape (backslash, the letter u in either case, exactly four hexadecimal digits).
     * Anything that merely looks like the start of an escape but is not well formed is copied to the output unchanged rather than causing an exception.
     * @param raw the text to scan, may be null
     * @return the text with every well formed escape replaced by the character it names, or null if raw is null
     */
    static public String transformUnicode(String raw) {
        if(raw == null) {
            return null;
        }
        int raw_length = raw.length();
        StringBuffer processed = new StringBuffer(raw_length);
        int index = 0;
        while(index < raw_length) {
            char current = raw.charAt(index);
            int consumed = 0;
            if(current == AND_CHAR) {
                consumed = decodeNumericEntity(raw, index, processed);
            }
            else if(current == ESCAPE_CHAR) {
                consumed = decodeUnicodeEscape(raw, index, processed);
            }
            if(consumed == 0) {
                // Not an escape after all, so the character stands for itself
                processed.append(current);
                consumed = 1;
            }
            index = index + consumed;
        }
        return processed.toString();
    }

    /** Attempts to decode an HTML decimal character reference that starts at the given ampersand.
     * @param raw the text being scanned
     * @param index the position of the ampersand within raw
     * @param processed the buffer the decoded character is appended to on success
     * @return the number of chars of raw that were consumed, or 0 if no well formed reference starts here (in which case nothing has been appended)
     */
    static private int decodeNumericEntity(String raw, int index, StringBuffer processed) {
        int raw_length = raw.length();
        if(index + 1 >= raw_length || raw.charAt(index + 1) != HASH_CHAR) {
            return 0;
        }
        int first_digit = index + 2;
        int position = first_digit;
        int value = 0;
        while(position < raw_length) {
            int digit = Character.digit(raw.charAt(position), 10);
            if(digit < 0) {
                break;
            }
            value = (value * 10) + digit;
            // Checking on every step also keeps the running value far away from int overflow
            if(value > MAX_CODE_POINT) {
                return 0;
            }
            position++;
        }
        // There must be at least one digit, and the digits must be closed off by a semicolon
        if(position == first_digit || position >= raw_length || raw.charAt(position) != SEMICOLON_CHAR) {
            return 0;
        }
        appendCodePoint(processed, value);
        return (position - index) + 1;
    }

    /** Attempts to decode a Java style escape (backslash, u or U, four hexadecimal digits) that starts at the given backslash.
     * @param raw the text being scanned
     * @param index the position of the backslash within raw
     * @param processed the buffer the decoded character is appended to on success
     * @return 6 if an escape was decoded, or 0 if no well formed escape starts here (in which case nothing has been appended)
     */
    static private int decodeUnicodeEscape(String raw, int index, StringBuffer processed) {
        // The marker letter and all four digits must be present
        if(index + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(index + 1);
        if(marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        int value = 0;
        for(int i = index + 2; i < index + 6; i++) {
            int digit = Character.digit(raw.charAt(i), 16);
            if(digit < 0) {
                // Ordinary text, such as a file path with a folder name starting with u
                return 0;
            }
            value = (value << 4) | digit;
        }
        processed.append((char) value);
        return 6;
    }

    /** Appends a code point to the buffer, writing a surrogate pair when it does not fit into a single char.
     * @param buffer the buffer to append to
     * @param code_point a value between 0 and MAX_CODE_POINT inclusive
     */
    static private void appendCodePoint(StringBuffer buffer, int code_point) {
        if(code_point < MIN_SUPPLEMENTARY_CODE_POINT) {
            buffer.append((char) code_point);
        }
        else {
            int offset = code_point - MIN_SUPPLEMENTARY_CODE_POINT;
            buffer.append((char) (0xD800 + (offset >> 10)));
            buffer.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /** Prints the raw text and the outcome of applying the named transform to it.
     * @param raw the text to convert
     * @param transform the name of the transform to apply
     */
    static private void report(String raw, String transform) {
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transform) + "'");
    }

    /** Prints the transform name as a test heading, then the raw text and the converted text.
     * @param transform the name of the transform to apply
     * @param raw the text to convert
     */
    static private void runTest(String transform, String raw) {
        System.err.println("Test " + transform);
        report(raw, transform);
    }

    /** Prints an escape sequence next to the outcome of decoding it.
     * @param escape the escape sequence to decode
     */
    static private void runUnicodeTest(String escape) {
        System.err.println(escape + " => " + transformUnicode(escape));
    }

    /** With fewer than two arguments runs a built in test suite, otherwise converts the first argument using the transform named by the second. All output goes to standard error.
     * The shell tests run both the Unix and the Windows tables so that the output does not depend on the machine the suite is run on.
     * @param args either nothing, or the raw text followed by a transform name
     */
    static public void main(String[] args) {
        if(args.length < 2) {
            System.err.println("Running Test Suite");

            String dom_text = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
            String html_text = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

            runTest(DOM_TO_GREENSTONE, dom_text);
            runTest(DOM_TO_TEXT, dom_text);
            runTest(GREENSTONE_TO_DOM, "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>");
            runTest(GREENSTONE_TO_TEXT, "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.");
            runTest(TEXT_TO_DOM, html_text);
            runTest(TEXT_TO_GREENSTONE, "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.");

            System.err.println("Test TEXT_TO_SHELL");
            System.err.println("[Unix Version]");
            report(html_text, TEXT_TO_SHELL_UNIX);
            System.err.println("[Windows Version]");
            report(html_text, TEXT_TO_SHELL_WINDOWS);

            System.err.println("***** UNICODE TEST *****");
            runUnicodeTest("\\u0030");
            runUnicodeTest("\\u0041");
            runUnicodeTest("\\u007a");
            runUnicodeTest("\\u00e7");
            runUnicodeTest("&#48;");
            runUnicodeTest("&#65;");
            runUnicodeTest("&#122;");
            runUnicodeTest("&#231;");
        }
        else {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
        }
    }
}
Note: See TracBrowser for help on using the repository browser.