source: trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 6069

Last change on this file since 6069 was 6069, checked in by jmt12, 20 years ago

Have rearranged where and how strings are fed through the Codec. After several hours' work and a dozen paper trials I discovered the TEXT_TO_DOM conversion was completely pointless (DOM does it itself). Also, the quotes only need to be dealt with if they are being sent to the collect.cfg file. Hopefully I've got it all going now - including using that pesky pipe character that I would rather not have to deal with. And everything seems to be ok - I tested all the dangerous characters, including square brackets and ampersand. I also tried hierarchies, and then as the pièce de résistance I tried a hierarchy with dangerous characters. All good. I'm all about the working metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28/*************************************************************************
29 * Written: 17-08-03
30 ************************************************************************/
31import java.util.*;
32import org.greenstone.gatherer.Gatherer;
33import org.greenstone.gatherer.util.Utility;
34/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
35 * @author John Thompson, Greenstone Digital Library, University of Waikato
36 * @version 2.3d
37 */
38public class Codec {
39
40 static final public String DECODE_PATH = "DECODE_PATH";
41 static final public String DECODE_SQUARE_BRACKETS = "DECODE_SQUARE_BRACKETS";
42 static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
43 static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
44 static final public String ENCODE_PATH = "ENCODE_PATH";
45 static final public String ENCODE_SQUARE_BRACKETS = "ENCODE_SQUARE_BRACKETS";
46 static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
47 static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
48 static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
49 static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
50 static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
51 static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";
52
53 static final private int MAX_CACHE_SIZE = 100;
54
55 static private HashMap TRANSFORMS;
56 static private HashMap3D CACHE;
57
58 /** Static function called to construct TRANSFORMS mappings */
59 static {
60 TRANSFORMS = new HashMap();
61
62 String[] decode_path = {
63 "\\|", "\\\\",
64 "|", "\\|"
65 };
66 TRANSFORMS.put(DECODE_PATH, decode_path);
67 decode_path = null;
68
69 // Transform text into text, but without [ and ]
70 String[] decode_square_brackets = {
71 "[", "\\[",
72 "]", "\\]"
73 };
74 TRANSFORMS.put(DECODE_SQUARE_BRACKETS, decode_square_brackets);
75 decode_square_brackets = null;
76
77 // Translate DOM encoded text into Greenstone encoding
78 String[] dom_to_greenstone = {
79 "'", "\\\\\'",
80 ">", ">",
81 "&lt;", "<",
82 "&quot;", "\\\\\"",
83 "&amp;", "&"
84 };
85 // removed "\n", "\\\\n", - config files are allowed new lines
86 // added "\\|", "\\\\"
87
88 TRANSFORMS.put(DOM_TO_GREENSTONE, dom_to_greenstone);
89 dom_to_greenstone = null;
90
91 // Transform DOM encoded text into plain text
92 String[] dom_to_text = {
93 "&amp;#091;", "\\[",
94 "&amp;#093;", "\\]",
95 "&apos;", "\'",
96 "&gt;", ">",
97 "&lt;", "<",
98 "&quot;", "\"",
99 "&amp;", "&"
100 };
101 TRANSFORMS.put(DOM_TO_TEXT, dom_to_text);
102 dom_to_text = null;
103
104 String[] encode_path = {
105 "\\|", "&#124;",
106 "\\\\", "\\|"
107 };
108 TRANSFORMS.put(ENCODE_PATH, encode_path);
109 encode_path = null;
110
111 // Transform text into text, but without [ and ]
112 String[] encode_square_brackets = {
113 "\\[", "&#091;",
114 "\\]", "&#093;"
115 };
116 TRANSFORMS.put(ENCODE_SQUARE_BRACKETS, encode_square_brackets);
117 encode_square_brackets = null;
118
119 // Transform Greenstone encoded text to DOM encoding
120 String[] greenstone_to_dom = {
121 "&", "&amp;",
122 "<", "&lt;",
123 ">", "&gt;",
124 "\\\\\"", "&quot;",
125 "\\\\\'", "&apos;"
126 };
127 // removed"\\\\n", "\n", added "\\\\", "\\|"
128
129 TRANSFORMS.put(GREENSTONE_TO_DOM, greenstone_to_dom);
130 greenstone_to_dom = null;
131
132 // Transform Greenstone encoded text to plain text
133 String[] greenstone_to_text = {
134 "\\\\\"", "\"",
135 "\\\\\'", "\'",
136 "&quot;", "\"",
137 "&apos;", "\'",
138 "&#091;", "\\[",
139 "&#093;", "\\]"
140 };
141 // removed "\\\\n", "\n", "\\|", "\\\\"
142
143 TRANSFORMS.put(GREENSTONE_TO_TEXT, greenstone_to_text);
144 greenstone_to_text = null;
145
146 // Transform plain html text into something that can be placed in a DOM
147 String[] text_to_dom = {
148 "&", "&amp;",
149 "<", "&lt;",
150 ">", "&gt;",
151 "\"", "&quot;",
152 "\'", "&apos;"
153 };
154 TRANSFORMS.put(TEXT_TO_DOM, text_to_dom);
155 text_to_dom = null;
156
157 // Transform plain html text into greenstone encoding
158 String[] text_to_greenstone = {
159
160 "\\[", "&#091;",
161 "\\]", "&#093;",
162 "\"", "&quot;",
163 "\n", "\\\\n"
164 };
165 // "\'", "&apos;",
166 // removed "\\\\", "\\|",
167 TRANSFORMS.put(TEXT_TO_GREENSTONE, text_to_greenstone);
168 text_to_greenstone = null;
169
170 // Transform plain html text into something that can be placed in a shell command
171 String[] text_to_shell_unix = {
172 "\"", "\\\\\"",
173 "\'", "\\\\\'",
174 "\n", "\\\\n"
175 };
176 TRANSFORMS.put(TEXT_TO_SHELL_UNIX, text_to_shell_unix);
177 text_to_shell_unix = null;
178
179 // Transform plain html text into something that can be placed in a shell command. Windows requires twice as many escaped for speech marks to be passed to underlying processes
180 String[] text_to_shell_windows = {
181 "\"", "\\\\\\\\\\\\\"",
182 "\'", "\\\\\'",
183 "\n", "\\\\n"
184 };
185 TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, text_to_shell_windows);
186 text_to_shell_windows = null;
187
188 CACHE = new HashMap3D();
189 }
190
191 static public String transform(String raw, String transform) {
192 if(raw == null) {
193 return raw;
194 }
195 ///ystem.err.println("Transforming by "+transform+":\n" + raw);
196 String processed = (String) CACHE.get(transform, raw);
197 if(processed == null) {
198 processed = raw;
199 String[] transforms = (String[]) TRANSFORMS.get(transform);
200 if(transforms != null) {
201 for(int i = 0; i < transforms.length; i = i + 2) {
202 String target = transforms[i];
203 String result = transforms[i+1];
204 processed = processed.replaceAll(target, result);
205 }
206 }
207 //Gatherer.println("\n*** Transform: " + transform + " ***");
208 //Gatherer.println("*** Raw : '" + raw + "'");
209 //Gatherer.println("*** Processed: '" + processed + "'");
210 // If cache is at maximum size, empty it and start again
211 if(CACHE.size() == MAX_CACHE_SIZE) {
212 CACHE.clear();
213 }
214 CACHE.put(transform, raw, processed);
215 }
216 return processed;
217 }
218
219 /** Transform either of the accepted unicode escape sequences styles from in the string into single characters */
220 static final private char AND_CHAR = '&';
221 static final private char ESCAPE_CHAR = '\\';
222 static final private char HASH_CHAR = '#';
223 static final private char LOWER_U_CHAR = 'u';
224 static final private char UPPER_U_CHAR = 'U';
225 static final private char SEMICOLON_CHAR = ';';
226
227 static public String transformUnicode(String raw) {
228 StringBuffer processed = new StringBuffer();
229 int index = 0;
230 int raw_length = raw.length();
231 while(index < raw_length) {
232 char c0 = raw.charAt(index);
233 switch(c0) {
234 case AND_CHAR:
235 if(index + 1 < raw_length) {
236 // First the HTML &#231; type
237 char c1 = raw.charAt(index + 1);
238 if(c1 == HASH_CHAR) {
239 StringBuffer number_str = new StringBuffer();
240 char c2;
241 int offset = 2;
242 while(index + offset < raw_length && (c2 = raw.charAt(index + offset)) != SEMICOLON_CHAR) {
243 number_str.append(c2);
244 offset++;
245 }
246 // We've either run out of characters or have parsed a number
247 if(index + offset < raw_length && raw.charAt(index + offset) == SEMICOLON_CHAR) {
248 int number = Integer.parseInt(number_str.toString());
249 processed.append((char)number);
250 index = index + offset;
251 number_str = null;
252 break;
253 }
254 number_str = null;
255 }
256 }
257 processed.append(c0);
258 break;
259 case ESCAPE_CHAR:
260 // Now the \u00e7 type
261 if(index + 1 < raw_length) {
262 char c3 = raw.charAt(index + 1);
263 if((c3 == UPPER_U_CHAR || c3 == LOWER_U_CHAR) && index + 5 < raw_length) {
264 // We read four digits
265 String hex_str = raw.substring(index + 2, index + 6);
266 int number = Integer.parseInt(hex_str, 16);
267 hex_str = null;
268 processed.append((char)number);
269 index = index + 5;
270 break;
271 }
272 }
273 processed.append(c0);
274 break;
275 default:
276 processed.append(c0);
277 }
278 index++;
279 }
280 return processed.toString();
281 }
282
283 static public void main(String[] args) {
284 if(args.length < 2) {
285 String processed;
286 String raw;
287 String transform;
288
289 System.err.println("Running Test Suite");
290
291 transform = "DOM_TO_GREENSTONE";
292 System.err.println("Test " + transform);
293 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
294 System.err.println("Raw: '" + raw + "'");
295 processed = transform(raw, transform);
296 System.err.println("Processed: '" + processed + "'");
297
298 transform = "DOM_TO_TEXT";
299 System.err.println("Test " + transform);
300 raw = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
301 System.err.println("Raw: '" + raw + "'");
302 processed = transform(raw, transform);
303 System.err.println("Processed: '" + processed + "'");
304
305 transform = "GREENSTONE_TO_DOM";
306 System.err.println("Test " + transform);
307 raw = "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>";
308 System.err.println("Raw: '" + raw + "'");
309 processed = transform(raw, transform);
310 System.err.println("Processed: '" + processed + "'");
311
312 transform = "GREENSTONE_TO_TEXT";
313 System.err.println("Test " + transform);
314 raw = "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.";
315 System.err.println("Raw: '" + raw + "'");
316 processed = transform(raw, transform);
317 System.err.println("Processed: '" + processed + "'");
318
319 transform = "TEXT_TO_DOM";
320 System.err.println("Test " + transform);
321 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
322 System.err.println("Raw: '" + raw + "'");
323 processed = transform(raw, transform);
324 System.err.println("Processed: '" + processed + "'");
325
326 transform = "TEXT_TO_GREENSTONE";
327 System.err.println("Test " + transform);
328 raw = "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.";
329 System.err.println("Raw: '" + raw + "'");
330 processed = transform(raw, transform);
331 System.err.println("Processed: '" + processed + "'");
332
333 transform = "TEXT_TO_SHELL";
334 System.err.println("Test " + transform);
335 if(Utility.isWindows()) {
336 System.err.println("[Windows Version]");
337 transform = "TEXT_TO_SHELL_WINDOWS";
338 }
339 else {
340 System.err.println("[Unix Version]");
341 transform = "TEXT_TO_SHELL_UNIX";
342 }
343 raw = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";
344 System.err.println("Raw: '" + raw + "'");
345 processed = transform(raw, transform);
346 System.err.println("Processed: '" + processed + "'");
347
348 System.err.println("***** UNICODE TEST *****");
349 System.err.println("\\u0030 => " + transformUnicode("\\u0030"));
350 System.err.println("\\u0041 => " + transformUnicode("\\u0041"));
351 System.err.println("\\u007a => " + transformUnicode("\\u007a"));
352 System.err.println("\\u00e7 => " + transformUnicode("\\u00e7"));
353 System.err.println("&#48; => " + transformUnicode("&#48;"));
354 System.err.println("&#65; => " + transformUnicode("&#65;"));
355 System.err.println("&#122; => " + transformUnicode("&#122;"));
356 System.err.println("&#231; => " + transformUnicode("&#231;"));
357 }
358 else {
359 System.err.println("Raw: '" + args[0] + "'");
360 System.err.println("Transform: " + args[1]);
361 String processed = transform(args[0], args[1]);
362 System.err.println("Processed: '" + processed + "'");
363 }
364 }
365}
Note: See TracBrowser for help on using the repository browser.