source: trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 6828

Last change on this file since 6828 was 6828, checked in by mdewsnip, 20 years ago

Added a transformation from plain text to a regular expression that matches it. This involves escaping special regular expression characters such as \().[]{}.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.9 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28/*************************************************************************
29 * Written: 17-08-03
30 ************************************************************************/
31import java.util.*;
32import org.greenstone.gatherer.Gatherer;
33import org.greenstone.gatherer.util.Utility;
34/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
35 * @author John Thompson, Greenstone Digital Library, University of Waikato
36 * @version 2.3d
37 */
38public class Codec {
39
40 static final public String DECODE_PATH = "DECODE_PATH";
41 static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
42 static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
43 static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
44 static final public String ENCODE_PATH = "ENCODE_PATH";
45 static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
46 static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
47 static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
48 static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
49 static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
50 static final public String TEXT_TO_REGEXP = "TEXT_TO_REGEXP";
51 static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
52 static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
53
54 static final private int MAX_CACHE_SIZE = 100;
55
56 static private HashMap TRANSFORMS;
57 static private HashMap3D CACHE;
58
59 /** Static function called to construct TRANSFORMS mappings */
60 static {
61 TRANSFORMS = new HashMap();
62
63 String[] decode_path = {
64 "\\|", "\\\\",
65 "|", "\\|"
66 };
67 TRANSFORMS.put(DECODE_PATH, decode_path);
68 decode_path = null;
69
70 // Transform text into text, but without [ and ]
71 String[] decode_square_brackets = {
72 "[", "\\[",
73 "]", "\\]"
74 };
75 TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
76 decode_square_brackets = null;
77
78 // Translate DOM encoded text into Greenstone encoding
79 String[] dom_to_greenstone = {
80 "'", "\\\\\'",
81 ">", ">",
82 "&lt;", "<",
83 "&quot;", "\\\\\"",
84 "&amp;", "&"
85 };
86 // removed "\n", "\\\\n", - config files are allowed new lines
87 // added "\\|", "\\\\"
88
89 TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
90 dom_to_greenstone = null;
91
92 // Transform DOM encoded text into plain text
93 String[] dom_to_text = {
94 "&amp;#091;", "\\[",
95 "&amp;#093;", "\\]",
96 "&apos;", "\'",
97 "&gt;", ">",
98 "&lt;", "<",
99 "&quot;", "\"",
100 "&amp;", "&"
101 };
102 TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
103 dom_to_text = null;
104
105 // Transform text into a regular expression that will match it
106 String[] text_to_regexp = {
107 "\\\\", "\\\\\\\\",
108 "\\(", "\\\\(",
109 "\\)", "\\\\)",
110 "\\[", "\\\\[",
111 "\\]", "\\\\]",
112 "\\{", "\\\\{",
113 "\\}", "\\\\}",
114 "\\.", "\\\\."
115 };
116 TRANSFORMS.put(TEXT_TO_REGEXP, text_to_regexp);
117 text_to_regexp = null;
118
119 String[] encode_path = {
120 "\\|", "&#124;",
121 "\\\\", "\\|"
122 };
123 TRANSFORMS.put(ENCODE_PATH, encode_path);
124 encode_path = null;
125
126 // Transform text into text, but without [ and ]
127 String[] encode_square_brackets = {
128 "\\[", "&#091;",
129 "\\]", "&#093;"
130 };
131 TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
132 encode_square_brackets = null;
133
134 // Transform Greenstone encoded text to DOM encoding
135 String[] greenstone_to_dom = {
136 "&", "&amp;",
137 "<", "&lt;",
138 ">", "&gt;",
139 "\\\\\"", "&quot;",
140 "\\\\\'", "&apos;",
141 "\"", "&quot;",
142 "\'", "&apos;"
143 };
144 // removed"\\\\n", "\n", added "\\\\", "\\|"
145
146 TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
147 greenstone_to_dom = null;
148
149 // Transform Greenstone encoded text to plain text
150 String[] greenstone_to_text = {
151 "\\\\\"", "\"",
152 "\\\\\'", "\'",
153 "&quot;", "\"",
154 "&apos;", "\'",
155 "&#091;", "\\[",
156 "&#093;", "\\]"
157 };
158 // removed "\\\\n", "\n", "\\|", "\\\\"
159
160 TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
161 greenstone_to_text = null;
162
163 // Transform plain html text into something that can be placed in a DOM
164 String[] text_to_dom = {
165 "&", "&amp;",
166 "<", "&lt;",
167 ">", "&gt;",
168 "\"", "&quot;",
169 "\'", "&apos;"
170 };
171 TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
172 text_to_dom = null;
173
174 // Transform plain html text into greenstone encoding
175 String[] text_to_greenstone = {
176
177 "\\[", "&#091;",
178 "\\]", "&#093;",
179 "\"", "&quot;",
180 "\n", "\\\\n"
181 };
182 // "\'", "&apos;",
183 // removed "\\\\", "\\|",
184 TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
185 text_to_greenstone = null;
186
187 // Transform plain html text into something that can be placed in a shell command
188 String[] text_to_shell_unix = {
189 "\"", "\\\\\"",
190 "\'", "\\\\\'",
191 "\n", "\\\\n"
192 };
193 TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
194 text_to_shell_unix = null;
195
196 // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
197 String[] text_to_shell_windows = {
198 "\"", "\\\\\\\\\\\\\"",
199 "\'", "\\\\\'",
200 "\n", "\\\\n"
201 };
202 TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
203 text_to_shell_windows = null;
204
205 CACHE = new HashMap3D();
206 }
207
208 static public String transform(String raw, String transform) {
209 if(raw == null) {
210 return raw;
211 }
212 // System.err.println("Transforming by "+transform+":\n" + raw);
213 String processed = (String) CACHE.get(transform, raw);
214 if(processed == null) {
215 processed = raw;
216 String[] transforms = (String[]) TRANSFORMS.get(transform);
217 if(transforms != null) {
218 for(int i = 0; i < transforms.length; i = i + 2) {
219 String target = transforms[i];
220 String result = transforms[i+1];
221 processed = processed.replaceAll(target, result);
222 }
223 }
224 Gatherer.println("\n*** Transform: " + transform + " ***");
225 Gatherer.println("*** Raw : '" + raw + "'");
226 Gatherer.println("*** Processed: '" + processed + "'");
227 // If cache is at maximum size, empty it and start again
228 if(CACHE.size() == MAX_CACHE_SIZE) {
229 CACHE.clear();
230 }
231 CACHE.put(transform, raw, processed);
232 }
233 return processed;
234 }
235
236 /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
237 static final private char AND_CHAR = '&';
238 static final private char ESCAPE_CHAR = '\\';
239 static final private char HASH_CHAR = '#';
240 static final private char LOWER_U_CHAR = 'u';
241 static final private char UPPER_U_CHAR = 'U';
242 static final private char SEMICOLON_CHAR = ';';
243
244 static public String transformUnicode(String raw) {
245 StringBuffer processed = new StringBuffer();
246 int index = 0;
247 int raw_length = raw.length();
248 while(index < raw_length) {
249 char c0 = raw.charAt(index);
250 switch(c0) {
251 case AND_CHAR:
252 if(index + 1 < raw_length) {
253 // First the HTML &#231; type
254 char c1 = raw.charAt(index + 1);
255 if(c1 == HASH_CHAR) {
256 StringBuffer number_str = new StringBuffer();
257 char c2;
258 int offset = 2;
259 while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
260 number_str.append(c2);
261 offset++;
262 }
263 // We've either run out of characters or have parsed a number
264 if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
265 int number = Integer.parseInt(number_str.toString());
266 processed.append((char)number);
267 index = index + offset;
268 number_str = null;
269 break;
270 }
271 number_str = null;
272 }
273 }
274 processed.append(c0);
275 break;
276 case ESCAPE_CHAR:
277 // Now the \u00e7 type
278 if(index + 1 < raw_length) {
279 char c3 = raw.charAt(index + 1);
280 if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
281 // We read four digits
282 String hex_str = raw.substring(index + 2, index + 6);
283 int number = Integer.parseInt(hex_str, 16);
284 hex_str = null;
285 processed.append((char)number);
286 index = index + 5;
287 break;
288 }
289 }
290 processed.append(c0);
291 break;
292 default:
293 processed.append(c0);
294 }
295 index++;
296 }
297 return processed.toString();
298 }
299
300 static public void main(String[] args) {
301 if(args.length < 2) {
302 String processed;
303 String raw;
304 String transform;
305
306 System.err.println("Running Test Suite");
307
308 transform = "DOM_TO_GREENSTONE";
309 System.err.println("Test " + transform);
310 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
311 System.err.println("Raw: '" + raw + "'");
312 processed = transform(raw, transform);
313 System.err.println("Processed: '" + processed + "'");
314
315 transform = "DOM_TO_TEXT";
316 System.err.println("Test " + transform);
317 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
318 System.err.println("Raw: '" + raw + "'");
319 processed = transform(raw, transform);
320 System.err.println("Processed: '" + processed + "'");
321
322 transform = "GREENSTONE_TO_DOM";
323 System.err.println("Test " + transform);
324 raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
325 System.err.println("Raw: '" + raw + "'");
326 processed = transform(raw, transform);
327 System.err.println("Processed: '" + processed + "'");
328
329 transform = "GREENSTONE_TO_TEXT";
330 System.err.println("Test " + transform);
331 raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
332 System.err.println("Raw: '" + raw + "'");
333 processed = transform(raw, transform);
334 System.err.println("Processed: '" + processed + "'");
335
336 transform = "TEXT_TO_DOM";
337 System.err.println("Test " + transform);
338 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
339 System.err.println("Raw: '" + raw + "'");
340 processed = transform(raw, transform);
341 System.err.println("Processed: '" + processed + "'");
342
343 transform = "TEXT_TO_GREENSTONE";
344 System.err.println("Test " + transform);
345 raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
346 System.err.println("Raw: '" + raw + "'");
347 processed = transform(raw, transform);
348 System.err.println("Processed: '" + processed + "'");
349
350 transform = "TEXT_TO_SHELL";
351 System.err.println("Test " + transform);
352 if(Utility.isWindows()) {
353 System.err.println("[Windows Version]");
354 transform = "TEXT_TO_SHELL_WINDOWS";
355 }
356 else {
357 System.err.println("[Unix Version]");
358 transform = "TEXT_TO_SHELL_UNIX";
359 }
360 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
361 System.err.println("Raw: '" + raw + "'");
362 processed = transform(raw, transform);
363 System.err.println("Processed: '" + processed + "'");
364
365 System.err.println("***** UNICODE TEST *****");
366 System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
367 System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
368 System.err.println("\\u007a => " + transformUnicode("\\u007a"));
369 System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
370 System.err.println("&#48; => " + transformUnicode("&#48;"));
371 System.err.println("&#65; => " + transformUnicode("&#65;"));
372 System.err.println("&#122; => " + transformUnicode("&#122;"));
373 System.err.println("&#231; => " + transformUnicode("&#231;"));
374 }
375 else {
376 System.err.println("Raw: '" + args[0] + "'");
377 System.err.println("Transform: " + args[1]);
378 String processed = transform(args[0], args[1]);
379 System.err.println("Processed: '" + processed + "'");
380 }
381 }
382}
Note: See TracBrowser for help on using the repository browser.