source: trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 8240

Last change on this file since 8240 was 8240, checked in by mdewsnip, 20 years ago

Removed unnecessary imports of org.greenstone.gatherer.Gatherer.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
27package org.greenstone.gatherer.util;
28/*************************************************************************
29 * Written: 17-08-03
30 ************************************************************************/
31import java.util.*;
32import org.greenstone.gatherer.util.Utility;
33/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
34 * @author John Thompson, Greenstone Digital Library, University of Waikato
35 * @version 2.3d
36 */
37public class Codec {
38
39 static final public String DECODE_PATH = "DECODE_PATH";
40 static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
41 static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
42 static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
43 static final public String ENCODE_PATH = "ENCODE_PATH";
44 static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
45 static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
46 static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
47 static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
48 static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
49 static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
50 static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
51 static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
52
53 static final private int MAX_CACHE_SIZE = 100;
54
55 static private HashMap TRANSFORMS;
56 static private HashMap3D CACHE;
57
58 /** Static function called to construct TRANSFORMS mappings */
59 static {
60 TRANSFORMS = new HashMap();
61
62 String[] decode_path = {
63 "\\|", "\\\\",
64 "|", "\\|"
65 };
66 TRANSFORMS.put(DECODE_PATH, decode_path);
67 decode_path = null;
68
69 // Transform text into text, but without [ and ]
70 String[] decode_square_brackets = {
71 "[", "\\[",
72 "]", "\\]"
73 };
74 TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
75 decode_square_brackets = null;
76
77 // Translate DOM encoded text into Greenstone encoding
78 String[] dom_to_greenstone = {
79 "'", "\\\\\'",
80 ">", ">",
81 "&lt;", "<",
82 "&quot;", "\\\\\"",
83 "&amp;", "&"
84 };
85 // removed "\n", "\\\\n", - config files are allowed new lines
86 // added "\\|", "\\\\"
87
88 TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
89 dom_to_greenstone = null;
90
91 // Transform DOM encoded text into plain text
92 String[] dom_to_text = {
93 "&amp;#091;", "\\[",
94 "&amp;#093;", "\\]",
95 "&apos;", "\'",
96 "&gt;", ">",
97 "&lt;", "<",
98 "&quot;", "\"",
99 "&amp;", "&"
100 };
101 TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
102 dom_to_text = null;
103
104 // Transform text into a regular expression that will match it
105 String[] text_to_regexp = {
106 "\\\\", "\\\\\\\\",
107 "\\(", "\\\\(",
108 "\\)", "\\\\)",
109 "\\[", "\\\\[",
110 "\\]", "\\\\]",
111 "\\{", "\\\\{",
112 "\\}", "\\\\}",
113 "\\.", "\\\\."
114 };
115 TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
116 text_to_regexp = null;
117
118 String[] encode_path = {
119 "\\|", "&#124;",
120 "\\\\", "\\|"
121 };
122 TRANSFORMS.put(ENCODE_PATH, encode_path);
123 encode_path = null;
124
125 // Transform text into text, but without [ and ]
126 String[] encode_square_brackets = {
127 "\\[", "&#091;",
128 "\\]", "&#093;"
129 };
130 TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
131 encode_square_brackets = null;
132
133 // Transform Greenstone encoded text to DOM encoding
134 String[] greenstone_to_dom = {
135 "&", "&amp;",
136 "<", "&lt;",
137 ">", "&gt;",
138 "\\\\\"", "&quot;",
139 "\\\\\'", "&apos;",
140 "\"", "&quot;",
141 "\'", "&apos;"
142 };
143 // removed"\\\\n", "\n", added "\\\\", "\\|"
144
145 TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
146 greenstone_to_dom = null;
147
148 // Transform Greenstone encoded text to plain text
149 String[] greenstone_to_text = {
150 "\\\\\"", "\"",
151 "\\\\\'", "\'",
152 "&quot;", "\"",
153 "&apos;", "\'",
154 "&#091;", "\\[",
155 "&#093;", "\\]"
156 };
157 // removed "\\\\n", "\n", "\\|", "\\\\"
158
159 TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
160 greenstone_to_text = null;
161
162 // Transform plain html text into something that can be placed in a DOM
163 String[] text_to_dom = {
164 "&", "&amp;",
165 "<", "&lt;",
166 ">", "&gt;",
167 "\"", "&quot;",
168 "\'", "&apos;"
169 };
170 TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
171 text_to_dom = null;
172
173 // Transform plain html text into greenstone encoding
174 String[] text_to_greenstone = {
175
176 "\\[", "&#091;",
177 "\\]", "&#093;",
178 "\"", "&quot;",
179 "\n", "\\\\n"
180 };
181 // "\'", "&apos;",
182 // removed "\\\\", "\\|",
183 TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
184 text_to_greenstone = null;
185
186 // Transform plain html text into something that can be placed in a shell command
187 String[] text_to_shell_unix = {
188 "\"", "\\\\\"",
189 "\'", "\\\\\'",
190 "\n", "\\\\n"
191 };
192 TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
193 text_to_shell_unix = null;
194
195 // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
196 String[] text_to_shell_windows = {
197 "\"", "\\\\\\\\\\\\\"",
198 "\'", "\\\\\'",
199 "\n", "\\\\n"
200 };
201 TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
202 text_to_shell_windows = null;
203
204 CACHE = new HashMap3D();
205 }
206
207 static public String transform(String raw, String transform) {
208 if(raw == null) {
209 return raw;
210 }
211 // System.err.println("Transforming by "+transform+":\n" + raw);
212 String processed = (String) CACHE.get(transform, raw);
213 if(processed == null) {
214 processed = raw;
215 String[] transforms = (String[]) TRANSFORMS.get(transform);
216 if(transforms != null) {
217 for(int i = 0; i < transforms.length; i = i + 2) {
218 String target = transforms[i];
219 String result = transforms[i+1];
220 processed = processed.replaceAll(target, result);
221 }
222 }
223 //DebugStream.println("\n*** Transform: " + transform + " ***");
224 //DebugStream.println("*** Raw : '" + raw + "'");
225 //DebugStream.println("*** Processed: '" + processed + "'");
226 // If cache is at maximum size, empty it and start again
227 if(CACHE.size() == MAX_CACHE_SIZE) {
228 CACHE.clear();
229 }
230 CACHE.put(transform, raw, processed);
231 }
232 return processed;
233 }
234
235 /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
236 static final private char AND_CHAR = '&';
237 static final private char ESCAPE_CHAR = '\\';
238 static final private char HASH_CHAR = '#';
239 static final private char LOWER_U_CHAR = 'u';
240 static final private char UPPER_U_CHAR = 'U';
241 static final private char SEMICOLON_CHAR = ';';
242
243 static public String transformUnicode(String raw) {
244 StringBuffer processed = new StringBuffer();
245 int index = 0;
246 int raw_length = raw.length();
247 while(index < raw_length) {
248 char c0 = raw.charAt(index);
249 switch(c0) {
250 case AND_CHAR:
251 if(index + 1 < raw_length) {
252 // First the HTML &#231; type
253 char c1 = raw.charAt(index + 1);
254 if(c1 == HASH_CHAR) {
255 StringBuffer number_str = new StringBuffer();
256 char c2;
257 int offset = 2;
258 while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
259 number_str.append(c2);
260 offset++;
261 }
262 // We've either run out of characters or have parsed a number
263 if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
264 int number = Integer.parseInt(number_str.toString());
265 processed.append((char)number);
266 index = index + offset;
267 number_str = null;
268 break;
269 }
270 number_str = null;
271 }
272 }
273 processed.append(c0);
274 break;
275 case ESCAPE_CHAR:
276 // Now the \u00e7 type
277 if(index + 1 < raw_length) {
278 char c3 = raw.charAt(index + 1);
279 if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
280 // We read four digits
281 String hex_str = raw.substring(index + 2, index + 6);
282 int number = Integer.parseInt(hex_str, 16);
283 hex_str = null;
284 processed.append((char)number);
285 index = index + 5;
286 break;
287 }
288 }
289 processed.append(c0);
290 break;
291 default:
292 processed.append(c0);
293 }
294 index++;
295 }
296 return processed.toString();
297 }
298
299 static public void main(String[] args) {
300 if(args.length < 2) {
301 String processed;
302 String raw;
303 String transform;
304
305 System.err.println("Running Test Suite");
306
307 transform = "DOM_TO_GREENSTONE";
308 System.err.println("Test " + transform);
309 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
310 System.err.println("Raw: '" + raw + "'");
311 processed = transform(raw, transform);
312 System.err.println("Processed: '" + processed + "'");
313
314 transform = "DOM_TO_TEXT";
315 System.err.println("Test " + transform);
316 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
317 System.err.println("Raw: '" + raw + "'");
318 processed = transform(raw, transform);
319 System.err.println("Processed: '" + processed + "'");
320
321 transform = "GREENSTONE_TO_DOM";
322 System.err.println("Test " + transform);
323 raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
324 System.err.println("Raw: '" + raw + "'");
325 processed = transform(raw, transform);
326 System.err.println("Processed: '" + processed + "'");
327
328 transform = "GREENSTONE_TO_TEXT";
329 System.err.println("Test " + transform);
330 raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
331 System.err.println("Raw: '" + raw + "'");
332 processed = transform(raw, transform);
333 System.err.println("Processed: '" + processed + "'");
334
335 transform = "TEXT_TO_DOM";
336 System.err.println("Test " + transform);
337 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
338 System.err.println("Raw: '" + raw + "'");
339 processed = transform(raw, transform);
340 System.err.println("Processed: '" + processed + "'");
341
342 transform = "TEXT_TO_GREENSTONE";
343 System.err.println("Test " + transform);
344 raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
345 System.err.println("Raw: '" + raw + "'");
346 processed = transform(raw, transform);
347 System.err.println("Processed: '" + processed + "'");
348
349 transform = "TEXT_TO_SHELL";
350 System.err.println("Test " + transform);
351 if(Utility.isWindows()) {
352 System.err.println("[Windows Version]");
353 transform = "TEXT_TO_SHELL_WINDOWS";
354 }
355 else {
356 System.err.println("[Unix Version]");
357 transform = "TEXT_TO_SHELL_UNIX";
358 }
359 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
360 System.err.println("Raw: '" + raw + "'");
361 processed = transform(raw, transform);
362 System.err.println("Processed: '" + processed + "'");
363
364 System.err.println("***** UNICODE TEST *****");
365 System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
366 System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
367 System.err.println("\\u007a => " + transformUnicode("\\u007a"));
368 System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
369 System.err.println("&#48; => " + transformUnicode("&#48;"));
370 System.err.println("&#65; => " + transformUnicode("&#65;"));
371 System.err.println("&#122; => " + transformUnicode("&#122;"));
372 System.err.println("&#231; => " + transformUnicode("&#231;"));
373 }
374 else {
375 System.err.println("Raw: '" + args[0] + "'");
376 System.err.println("Transform: " + args[1]);
377 String processed = transform(args[0], args[1]);
378 System.err.println("Processed: '" + processed + "'");
379 }
380 }
381}
Note: See TracBrowser for help on using the repository browser.