source: trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 5807

Last change on this file since 5807 was 5805, checked in by jmt12, 21 years ago

Several changes needed to make pipe the hierarchy separator while keeping backslash as what the user sees

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28/*************************************************************************
29 * Written: 17-08-03
30 ************************************************************************/
31import java.util.*;
32import org.greenstone.gatherer.Gatherer;
33import org.greenstone.gatherer.util.Utility;
34/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
35 * @author John Thompson, Greenstone Digital Library, University of Waikato
36 * @version 2.3d
37 */
38public class Codec {
39
40 static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
41 static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
42 static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
43 static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
44 static final public String REMOVE_SQUARE_BRACKET = "REMOVE_SQUARE_BRACKET";
45 static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
46 static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
47 static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
48 static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
49
50 static final private int MAX_CACHE_SIZE = 100;
51
52 static private HashMap TRANSFORMS;
53 static private HashMap3D CACHE;
54
55
56 /** Static function called to construct TRANSFORMS mappings */
57 static {
58 TRANSFORMS = new HashMap();
59
60 // Translate DOM encoded text into Greenstone encoding
61 String[] dom_to_greenstone = {
62 "'", "\\\\\'",
63 ">", ">",
64 "&lt;", "<",
65 "&quot;", "\\\\\"",
66 "\n", "\\\\n",
67 "&amp;", "&"
68 };
69 TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
70 dom_to_greenstone = null;
71
72 // Transform DOM encoded text into plain text
73 String[] dom_to_text = {
74 "&amp;#091;", "\\[",
75 "&amp;#093;", "\\]",
76 "&apos;", "\'",
77 "&gt;", ">",
78 "&lt;", "<",
79 "&quot;", "\"",
80 "&amp;", "&",
81 "\\|", "\\\\"
82 };
83 TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
84 dom_to_text = null;
85
86 // Transform Greenstone encoded text to DOM encoding
87 String[] greenstone_to_dom = {
88 "&", "&amp;",
89 "<", "&lt;",
90 ">", "&gt;",
91 "\\\\\"", "&quot;",
92 "\\\\\'", "&apos;",
93 "\\\\n", "\n"
94 };
95 TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
96 greenstone_to_dom = null;
97
98 // Transform Greenstone encoded text to plain text
99 String[] greenstone_to_text = {
100 "\\\\\"", "\"",
101 "\\\\\'", "\'",
102 "\\\\n", "\n",
103 "&quot;", "\"",
104 "&apos;", "\'",
105 "&#091;", "\\[",
106 "&#093;", "\\]",
107 "\\|", "\\\\"
108 };
109 TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
110 greenstone_to_text = null;
111
112 // Transform text into text, but without [ and ]
113 String[] remove_square_bracket = {
114 "\\[", "&amp;#091;",
115 "\\]", "&amp;#093;"
116 };
117 TRANSFORMS.put(REMOVE_SQUARE_BRACKET, remove_square_bracket);
118 remove_square_bracket = null;
119
120 // Transform plain html text into something that can be placed in a DOM
121 String[] text_to_dom = {
122 "&", "&amp;",
123 "<", "&lt;",
124 ">", "&gt;",
125 "\"", "&quot;",
126 "\'", "&apos;",
127 "\\\\", "\\|",
128 };
129 TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
130 text_to_dom = null;
131
132 // Transform plain html text into greenstone encoding
133 String[] text_to_greenstone = {
134 "\\\\", "\\|",
135 "\\[", "&#091;",
136 "\\]", "&#093;",
137 "\"", "&quot;",
138 "\'", "&apos;",
139 "\n", "\\\\n"
140 };
141 TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
142 text_to_greenstone = null;
143
144 // Transform plain html text into something that can be placed in a shell command
145 String[] text_to_shell_unix = {
146 "\"", "\\\\\"",
147 "\'", "\\\\\'",
148 "\n", "\\\\n"
149 };
150 TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
151 text_to_shell_unix = null;
152
153 // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
154 String[] text_to_shell_windows = {
155 "\"", "\\\\\\\\\\\\\"",
156 "\'", "\\\\\'",
157 "\n", "\\\\n"
158 };
159 TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
160 text_to_shell_windows = null;
161
162 CACHE = new HashMap3D();
163 }
164
165 static public String transform(String raw, String transform) {
166 String processed = (String) CACHE.get(transform, raw);
167 if(processed == null) {
168 processed = raw;
169 String[] transforms = (String[]) TRANSFORMS.get(transform);
170 if(transforms != null) {
171 for(int i = 0; i < transforms.length; i = i + 2) {
172 String target = transforms[i];
173 String result = transforms[i+1];
174 processed = processed.replaceAll(target, result);
175 }
176 }
177 ///atherer.println("*** Transform: " + transform + " ***");
178 ///atherer.println("*** Raw : '" + raw + "'");
179 ///atherer.println("*** Processed: '" + processed + "'");
180 // If cache is at maximum size, empty it and start again
181 if(CACHE.size() == MAX_CACHE_SIZE) {
182 CACHE.clear();
183 }
184 CACHE.put(transform, raw, processed);
185 }
186 return processed;
187 }
188
189 /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
190 static final private char AND_CHAR = '&';
191 static final private char ESCAPE_CHAR = '\\';
192 static final private char HASH_CHAR = '#';
193 static final private char LOWER_U_CHAR = 'u';
194 static final private char UPPER_U_CHAR = 'U';
195 static final private char SEMICOLON_CHAR = ';';
196
197 static public String transformUnicode(String raw) {
198 StringBuffer processed = new StringBuffer();
199 int index = 0;
200 int raw_length = raw.length();
201 while(index < raw_length) {
202 char c0 = raw.charAt(index);
203 switch(c0) {
204 case AND_CHAR:
205 if(index + 1 < raw_length) {
206 // First the HTML &#231; type
207 char c1 = raw.charAt(index + 1);
208 if(c1 == HASH_CHAR) {
209 StringBuffer number_str = new StringBuffer();
210 char c2;
211 int offset = 2;
212 while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
213 number_str.append(c2);
214 offset++;
215 }
216 // We've either run out of characters or have parsed a number
217 if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
218 int number = Integer.parseInt(number_str.toString());
219 processed.append((char)number);
220 index = index + offset;
221 number_str = null;
222 break;
223 }
224 number_str = null;
225 }
226 }
227 processed.append(c0);
228 break;
229 case ESCAPE_CHAR:
230 // Now the \u00e7 type
231 if(index + 1 < raw_length) {
232 char c3 = raw.charAt(index + 1);
233 if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
234 // We read four digits
235 String hex_str = raw.substring(index + 2, index + 6);
236 int number = Integer.parseInt(hex_str, 16);
237 hex_str = null;
238 processed.append((char)number);
239 index = index + 5;
240 break;
241 }
242 }
243 processed.append(c0);
244 break;
245 default:
246 processed.append(c0);
247 }
248 index++;
249 }
250 return processed.toString();
251 }
252
253 static public void main(String[] args) {
254 if(args.length < 2) {
255 String processed;
256 String raw;
257 String transform;
258
259 System.err.println("Running Test Suite");
260
261 transform = "DOM_TO_GREENSTONE";
262 System.err.println("Test " + transform);
263 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
264 System.err.println("Raw: '" + raw + "'");
265 processed = transform(raw, transform);
266 System.err.println("Processed: '" + processed + "'");
267
268 transform = "DOM_TO_TEXT";
269 System.err.println("Test " + transform);
270 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
271 System.err.println("Raw: '" + raw + "'");
272 processed = transform(raw, transform);
273 System.err.println("Processed: '" + processed + "'");
274
275 transform = "GREENSTONE_TO_DOM";
276 System.err.println("Test " + transform);
277 raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
278 System.err.println("Raw: '" + raw + "'");
279 processed = transform(raw, transform);
280 System.err.println("Processed: '" + processed + "'");
281
282 transform = "GREENSTONE_TO_TEXT";
283 System.err.println("Test " + transform);
284 raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
285 System.err.println("Raw: '" + raw + "'");
286 processed = transform(raw, transform);
287 System.err.println("Processed: '" + processed + "'");
288
289 transform = "TEXT_TO_DOM";
290 System.err.println("Test " + transform);
291 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
292 System.err.println("Raw: '" + raw + "'");
293 processed = transform(raw, transform);
294 System.err.println("Processed: '" + processed + "'");
295
296 transform = "TEXT_TO_GREENSTONE";
297 System.err.println("Test " + transform);
298 raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
299 System.err.println("Raw: '" + raw + "'");
300 processed = transform(raw, transform);
301 System.err.println("Processed: '" + processed + "'");
302
303 transform = "TEXT_TO_SHELL";
304 System.err.println("Test " + transform);
305 if(Utility.isWindows()) {
306 System.err.println("[Windows Version]");
307 transform = "TEXT_TO_SHELL_WINDOWS";
308 }
309 else {
310 System.err.println("[Unix Version]");
311 transform = "TEXT_TO_SHELL_UNIX";
312 }
313 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
314 System.err.println("Raw: '" + raw + "'");
315 processed = transform(raw, transform);
316 System.err.println("Processed: '" + processed + "'");
317
318 System.err.println("***** UNICODE TEST *****");
319 System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
320 System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
321 System.err.println("\\u007a => " + transformUnicode("\\u007a"));
322 System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
323 System.err.println("&#48; => " + transformUnicode("&#48;"));
324 System.err.println("&#65; => " + transformUnicode("&#65;"));
325 System.err.println("&#122; => " + transformUnicode("&#122;"));
326 System.err.println("&#231; => " + transformUnicode("&#231;"));
327 }
328 else {
329 System.err.println("Raw: '" + args[0] + "'");
330 System.err.println("Transform: " + args[1]);
331 String processed = transform(args[0], args[1]);
332 System.err.println("Processed: '" + processed + "'");
333 }
334 }
335}
Note: See TracBrowser for help on using the repository browser.