/**
 *#########################################################################
 *
 * A component of the Gatherer application, part of the Greenstone digital
 * library suite from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * Author: John Thompson, Greenstone Digital Library, University of Waikato
 *
 * Copyright (C) 1999 New Zealand Digital Library Project
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *########################################################################
 */
package org.greenstone.gatherer.util;
/*************************************************************************
 * Written: 17-08-03
 ************************************************************************/
import java.util.*;
import org.greenstone.gatherer.Gatherer;
import org.greenstone.gatherer.util.Utility;
/**
 * Provides a standard, extensible way to convert a string from one format to another, given that
 * each format has differing requirements regarding legal characters and escaped characters.
 *
 * <p>Every named transform is an ordered list of (regular expression, replacement) pairs that are
 * applied one after the other with {@link String#replaceAll(String, String)}. The order of the
 * pairs matters: for example the DOM decoders must rewrite the bracket references before they
 * rewrite the bare ampersand entity.
 *
 * <p>NOTE(review): the entity spellings in the tables below (and in the self-test strings) were
 * reconstructed from a copy of this file in which HTML entities had been decoded; confirm them
 * against the repository version before relying on exact output.
 *
 * @author John Thompson, Greenstone Digital Library, University of Waikato
 * @version 2.3d
 */
public class Codec {

    public static final String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    public static final String DOM_TO_TEXT = "DOM_TO_TEXT";
    public static final String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    public static final String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    public static final String REMOVE_SQUARE_BRACKET = "REMOVE_SQUARE_BRACKET";
    public static final String TEXT_TO_DOM = "TEXT_TO_DOM";
    public static final String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    public static final String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    public static final String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** Number of cached results after which the whole cache is discarded. */
    private static final int MAX_CACHE_SIZE = 100;

    /** Transform name -> flat array of alternating regular expression / replacement strings. */
    private static final Map<String, String[]> TRANSFORMS = new HashMap<String, String[]>();

    /** Transform name -> (raw string -> processed string). */
    private static final Map<String, Map<String, String>> CACHE =
        new HashMap<String, Map<String, String>>();

    /** Total number of results currently stored across all the per-transform maps in CACHE. */
    private static int cachedEntries = 0;

    /* Builds the TRANSFORMS mappings. */
    static {
        // Translate DOM encoded text into Greenstone encoding
        TRANSFORMS.put(DOM_TO_GREENSTONE, new String[] {
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "\n", "\\\\n",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text
        TRANSFORMS.put(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&",
            "\\|", "\\\\"
        });

        // Transform Greenstone encoded text to DOM encoding
        TRANSFORMS.put(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;",
            "\\\\n", "\n"
        });

        // Transform Greenstone encoded text to plain text
        TRANSFORMS.put(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "\\\\n", "\n",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]",
            "\\|", "\\\\"
        });

        // Transform text into text, but without [ and ]
        TRANSFORMS.put(REMOVE_SQUARE_BRACKET, new String[] {
            "\\[", "&amp;#091;",
            "\\]", "&amp;#093;"
        });

        // Transform plain html text into something that can be placed in a DOM
        TRANSFORMS.put(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;",
            "\\\\", "\\|"
        });

        // Transform plain html text into greenstone encoding
        TRANSFORMS.put(TEXT_TO_GREENSTONE, new String[] {
            "\\\\", "\\|",
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\'", "&apos;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // As above, but Windows requires more escaping for speech marks to be passed through to
        // underlying processes
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /**
     * Applies the named transform to a string.
     *
     * <p>Results are cached per (transform, raw) pair; once {@code MAX_CACHE_SIZE} results have
     * been stored the cache is emptied and filling starts again. The cache is not synchronized.
     *
     * @param raw the string to convert; may be null
     * @param transform the name of the transform, normally one of the constants of this class
     * @return the converted string; {@code raw} itself when the transform name is unknown, and
     *         null when {@code raw} is null
     */
    public static String transform(String raw, String transform) {
        if (raw == null) {
            return null;
        }
        String[] pairs = TRANSFORMS.get(transform);
        if (pairs == null) {
            // Unknown transform: nothing to apply, and nothing worth caching
            return raw;
        }

        Map<String, String> perTransform = CACHE.get(transform);
        if (perTransform != null) {
            String cached = perTransform.get(raw);
            if (cached != null) {
                return cached;
            }
        }

        // The targets are regular expressions and the results use replaceAll's replacement
        // syntax, which is why the tables double-escape their backslashes
        String processed = raw;
        for (int i = 0; i + 1 < pairs.length; i += 2) {
            processed = processed.replaceAll(pairs[i], pairs[i + 1]);
        }

        // If cache is at maximum size, empty it and start again
        if (cachedEntries >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cachedEntries = 0;
            perTransform = null;
        }
        if (perTransform == null) {
            perTransform = new HashMap<String, String>();
            CACHE.put(transform, perTransform);
        }
        perTransform.put(raw, processed);
        cachedEntries++;
        return processed;
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /**
     * Replaces both accepted styles of unicode escape sequence in a string with the characters
     * they stand for: HTML decimal references (ampersand, hash, decimal digits, semicolon, such
     * as {@code &#231;}) and Java-style escapes (a backslash, then u or U, then exactly four
     * hexadecimal digits).
     *
     * <p>Anything that merely looks like the start of an escape but is not well formed (no
     * digits, non-digit characters, a missing semicolon, or a value that is not a Unicode code
     * point) is copied through unchanged rather than raising an exception. Decimal references
     * above U+FFFF are emitted as a surrogate pair.
     *
     * @param raw the string that may contain escape sequences; may be null
     * @return the string with every well-formed escape replaced, or null when {@code raw} is null
     */
    public static String transformUnicode(String raw) {
        if (raw == null) {
            return null;
        }
        int length = raw.length();
        StringBuilder processed = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            char current = raw.charAt(index);
            int codePoint = -1;
            int consumed = 0;

            if (current == AND_CHAR) {
                // The HTML decimal reference type
                if (index + 1 < length && raw.charAt(index + 1) == HASH_CHAR) {
                    int end = index + 2;
                    while (end < length && Character.digit(raw.charAt(end), 10) >= 0) {
                        end++;
                    }
                    // Only a run of digits closed by a semicolon counts as a reference
                    if (end < length && raw.charAt(end) == SEMICOLON_CHAR) {
                        codePoint = parseCodePoint(raw.substring(index + 2, end), 10);
                        consumed = end + 1 - index;
                    }
                }
            }
            else if (current == ESCAPE_CHAR && index + 5 < length) {
                // The backslash-u type: the marker letter plus exactly four hex digits
                char marker = raw.charAt(index + 1);
                if (marker == UPPER_U_CHAR || marker == LOWER_U_CHAR) {
                    codePoint = parseCodePoint(raw.substring(index + 2, index + 6), 16);
                    consumed = 6;
                }
            }

            if (codePoint >= 0) {
                processed.appendCodePoint(codePoint);
                index += consumed;
            }
            else {
                processed.append(current);
                index++;
            }
        }
        return processed.toString();
    }

    /**
     * Parses an unsigned number in the given radix and checks that it is a Unicode code point.
     *
     * @return the code point, or -1 when the text is empty, contains a non-digit, overflows an
     *         int, or lies outside the Unicode range
     */
    private static int parseCodePoint(String digits, int radix) {
        if (digits.length() == 0) {
            return -1;
        }
        for (int i = 0; i < digits.length(); i++) {
            // Rejects signs and any other character Integer.parseInt might tolerate
            if (Character.digit(digits.charAt(i), radix) < 0) {
                return -1;
            }
        }
        try {
            int value = Integer.parseInt(digits, radix);
            return Character.isValidCodePoint(value) ? value : -1;
        }
        catch (NumberFormatException tooLarge) {
            // All characters are digits, so the only remaining failure is int overflow
            return -1;
        }
    }

    /** Prints a raw string and the result of running it through the named transform. */
    private static void report(String raw, String transform) {
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transform) + "'");
    }

    /**
     * Command line entry point. With two or more arguments, transforms the first argument using
     * the transform named by the second. Otherwise runs a small self-test that prints a sample
     * conversion for each transform and for each unicode escape style.
     *
     * @param args optionally the raw string followed by the transform name
     */
    public static void main(String[] args) {
        if (args.length >= 2) {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            System.err.println("Processed: '" + transform(args[0], args[1]) + "'");
            return;
        }

        System.err.println("Running Test Suite");

        String dom_sample = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
        String text_sample = "A <\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

        System.err.println("Test " + DOM_TO_GREENSTONE);
        report(dom_sample, DOM_TO_GREENSTONE);

        System.err.println("Test " + DOM_TO_TEXT);
        report(dom_sample, DOM_TO_TEXT);

        System.err.println("Test " + GREENSTONE_TO_DOM);
        report("A <\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>", GREENSTONE_TO_DOM);

        System.err.println("Test " + GREENSTONE_TO_TEXT);
        report("These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.", GREENSTONE_TO_TEXT);

        System.err.println("Test " + TEXT_TO_DOM);
        report(text_sample, TEXT_TO_DOM);

        System.err.println("Test " + TEXT_TO_GREENSTONE);
        report("These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.", TEXT_TO_GREENSTONE);

        System.err.println("Test " + "TEXT_TO_SHELL");
        // The platform is read from the JDK directly so the self-test needs nothing outside
        // this class. NOTE(review): this replaces a call to Utility.isWindows() - confirm the
        // two agree on the platforms that matter.
        if (System.getProperty("os.name", "").startsWith("Windows")) {
            System.err.println("[Windows Version]");
            report(text_sample, TEXT_TO_SHELL_WINDOWS);
        }
        else {
            System.err.println("[Unix Version]");
            report(text_sample, TEXT_TO_SHELL_UNIX);
        }

        System.err.println("***** UNICODE TEST *****");
        String[] escapes = {
            "\\u0030", "\\u0041", "\\u007a", "\\u00e7",
            "&#48;", "&#65;", "&#122;", "&#231;"
        };
        for (int i = 0; i < escapes.length; i++) {
            System.err.println(escapes[i] + " => " + transformUnicode(escapes[i]));
        }
    }
}