source: trunk/gli/src/org/greenstone/gatherer/util/Codec.java@ 6051

Last change on this file since 6051 was 6051, checked in by jmt12, 20 years ago

Here is the result of sixteen hours work over the weekend. I'm too tired to comment them all separately, but here are some of the highlights:
Rewrote how the 'base on collection' method actually retrieves and updates the collection configuration - ensuring the CDM.CollectionConfiguration class is used instead of the flawed Collection.CollectionConfiguration (which coincidentally has had a name change to BasicCollectionConfiguration). Went through the code searching for places where the two versions had been confused. Rewrote large swathes of GDMDocument so as to differentiate between normal and extracted metadata - an attempt to prevent the snowballing extracted metadata problem. Fixed a problem where GLI was not correctly receiving the last few lines of an external process. The collection shortname is no longer visible, nor is the confusing double name for metadata elements. Also, coloured folders in the trees are kaput. The user's email is now saved as part of the GLI configuration and is used as appropriate to fill out collection fields. There are new options on the right-click menus over trees to allow the expansion and collapsing of folders. 'Show Files' now shows all types (or at least 6 types) of image properly (arg, the plagues of copy and paste). 'Based On' collections are public, the plugin list automatically moves to the next entry if a plugin is removed (I guess we should do the same in every other screen?) and metadata arguments in plugins/classifiers are no longer editable. There are about a dozen other small things, but I can't remember them. Hope I remembered to set all of the files to UNIX line-endings.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.1 KB
Line 
1/**
2 *#########################################################################
3 *
4 * A component of the Gatherer application, part of the Greenstone digital
5 * library suite from the New Zealand Digital Library Project at the
6 * University of Waikato, New Zealand.
7 *
8 * Author: John Thompson, Greenstone Digital Library, University of Waikato
9 *
10 * Copyright (C) 1999 New Zealand Digital Library Project
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25 *########################################################################
26 */
27package org.greenstone.gatherer.util;
28/*************************************************************************
29 * Written: 17-08-03
30 ************************************************************************/
31import java.util.*;
32import org.greenstone.gatherer.Gatherer;
33import org.greenstone.gatherer.util.Utility;
34/** Provides a standard, extensible way to convert from one format of string to another (given that each format has differing requirements regarding legal characters and escaped characters)
35 * @author John Thompson, Greenstone Digital Library, University of Waikato
36 * @version 2.3d
37 */
public class Codec {

    /** Name of the transform that turns each '|' back into a backslash. */
    static final public String DECODE_PATH = "DECODE_PATH";
    /** Name of the transform from DOM encoded text to Greenstone encoding. */
    static final public String DOM_TO_GREENSTONE = "DOM_TO_GREENSTONE";
    /** Name of the transform from DOM encoded text to plain text. */
    static final public String DOM_TO_TEXT = "DOM_TO_TEXT";
    /** Name of the transform that turns each backslash into a '|'. */
    static final public String ENCODE_PATH = "ENCODE_PATH";
    /** Name of the transform from Greenstone encoded text to DOM encoding. */
    static final public String GREENSTONE_TO_DOM = "GREENSTONE_TO_DOM";
    /** Name of the transform from Greenstone encoded text to plain text. */
    static final public String GREENSTONE_TO_TEXT = "GREENSTONE_TO_TEXT";
    /** Name of the transform that replaces '[' and ']' with escaped numeric entities. */
    static final public String REMOVE_SQUARE_BRACKET = "REMOVE_SQUARE_BRACKET";
    /** Name of the transform from plain text to something that can be placed in a DOM. */
    static final public String TEXT_TO_DOM = "TEXT_TO_DOM";
    /** Name of the transform from plain text to Greenstone encoding. */
    static final public String TEXT_TO_GREENSTONE = "TEXT_TO_GREENSTONE";
    /** Name of the transform from plain text to something that can be placed in a Unix shell command. */
    static final public String TEXT_TO_SHELL_UNIX = "TEXT_TO_SHELL_UNIX";
    /** Name of the transform from plain text to something that can be placed in a Windows shell command. */
    static final public String TEXT_TO_SHELL_WINDOWS = "TEXT_TO_SHELL_WINDOWS";

    /** The maximum number of results remembered before the cache is emptied and started again. */
    static final private int MAX_CACHE_SIZE = 100;
    /** The largest value a numeric character reference may denote. */
    static final private int MAX_CODE_POINT = 0x10FFFF;

    /** Maps a transform name to a String[] of alternating (regular expression, replacement) entries. */
    static final private HashMap TRANSFORMS = new HashMap();
    /** Maps a transform name to a HashMap from raw string to its already processed form. */
    static final private HashMap CACHE = new HashMap();
    /** The total number of processed strings currently held in CACHE, over all transforms. */
    static private int cache_entries = 0;

    /** Static block which registers the replacement table for each transform. The pairs of a table are applied in the order listed, each one to the output of the previous, using String.replaceAll(), so the first string of a pair is a regular expression and the second uses replaceAll's replacement escaping. */
    static {
        TRANSFORMS.put(DECODE_PATH, new String[] {
            "\\|", "\\\\"
        });

        // Translate DOM encoded text into Greenstone encoding.
        // Raw apostrophes must be escaped before &apos; is decoded, otherwise the apostrophe of a freshly written \' would be escaped a second time.
        // config files are allowed new lines, so "\n" is deliberately not translated.
        TRANSFORMS.put(DOM_TO_GREENSTONE, new String[] {
            "'", "\\\\\'",
            "&apos;", "\\\\\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\\\\\"",
            "&amp;", "&"
        });

        // Transform DOM encoded text into plain text. &amp; goes last so that an escaped entity such as &amp;lt; is only decoded one level.
        TRANSFORMS.put(DOM_TO_TEXT, new String[] {
            "&amp;#091;", "\\[",
            "&amp;#093;", "\\]",
            "&apos;", "\'",
            "&gt;", ">",
            "&lt;", "<",
            "&quot;", "\"",
            "&amp;", "&"
        });

        TRANSFORMS.put(ENCODE_PATH, new String[] {
            "\\\\", "\\|"
        });

        // Transform Greenstone encoded text to DOM encoding. & goes first so the ampersands of the entities written afterwards are left alone.
        TRANSFORMS.put(GREENSTONE_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\\\\\"", "&quot;",
            "\\\\\'", "&apos;"
        });

        // Transform Greenstone encoded text to plain text
        TRANSFORMS.put(GREENSTONE_TO_TEXT, new String[] {
            "\\\\\"", "\"",
            "\\\\\'", "\'",
            "&quot;", "\"",
            "&apos;", "\'",
            "&#091;", "\\[",
            "&#093;", "\\]"
        });

        // Transform text into text, but without [ and ]
        TRANSFORMS.put(REMOVE_SQUARE_BRACKET, new String[] {
            "\\[", "&amp;#091;",
            "\\]", "&amp;#093;"
        });

        // Transform plain html text into something that can be placed in a DOM
        TRANSFORMS.put(TEXT_TO_DOM, new String[] {
            "&", "&amp;",
            "<", "&lt;",
            ">", "&gt;",
            "\"", "&quot;",
            "\'", "&apos;"
        });

        // Transform plain html text into greenstone encoding
        TRANSFORMS.put(TEXT_TO_GREENSTONE, new String[] {
            "\\[", "&#091;",
            "\\]", "&#093;",
            "\"", "&quot;",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command
        TRANSFORMS.put(TEXT_TO_SHELL_UNIX, new String[] {
            "\"", "\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });

        // Transform plain html text into something that can be placed in a shell command. Windows requires more escapes for speech marks to be passed to underlying processes
        TRANSFORMS.put(TEXT_TO_SHELL_WINDOWS, new String[] {
            "\"", "\\\\\\\\\\\\\"",
            "\'", "\\\\\'",
            "\n", "\\\\n"
        });
    }

    /** Convert a string from one format to another by applying the named transform's replacement pairs in order. Results are remembered in a small cache, which is emptied once it holds MAX_CACHE_SIZE entries. This method performs no synchronisation.
     * @param raw the String to convert, may be null
     * @param transform the name of the transform to apply, usually one of the constants of this class
     * @return the converted String; null if raw was null; raw unchanged if the transform name is not known
     */
    static public String transform(String raw, String transform) {
        if(raw == null) {
            return raw;
        }
        HashMap transform_cache = (HashMap) CACHE.get(transform);
        if(transform_cache != null) {
            String cached = (String) transform_cache.get(raw);
            if(cached != null) {
                return cached;
            }
        }
        String processed = raw;
        String[] transforms = (String[]) TRANSFORMS.get(transform);
        if(transforms != null) {
            // Entries come in (target, result) pairs; a trailing unpaired entry is ignored
            for(int i = 0; i + 1 < transforms.length; i = i + 2) {
                processed = processed.replaceAll(transforms[i], transforms[i + 1]);
            }
        }
        // If cache is at maximum size, empty it and start again. Entries are counted over all transforms so the bound applies to the total number of cached strings.
        if(cache_entries >= MAX_CACHE_SIZE) {
            CACHE.clear();
            cache_entries = 0;
            transform_cache = null;
        }
        if(transform_cache == null) {
            transform_cache = new HashMap();
            CACHE.put(transform, transform_cache);
        }
        transform_cache.put(raw, processed);
        cache_entries++;
        return processed;
    }

    static final private char AND_CHAR = '&';
    static final private char ESCAPE_CHAR = '\\';
    static final private char HASH_CHAR = '#';
    static final private char LOWER_U_CHAR = 'u';
    static final private char UPPER_U_CHAR = 'U';
    static final private char SEMICOLON_CHAR = ';';

    /** Transform either of the accepted unicode escape sequence styles found in the string into single characters. The two styles are the HTML decimal character reference (such as &#231;) and the Java style escape (a backslash, then 'u' or 'U', then exactly four hexadecimal digits). Anything that merely resembles an escape, for instance a backslash followed by 'u' in a file path, or an ampersand and hash not followed by digits and a semicolon, is copied to the output unchanged rather than causing an exception.
     * @param raw the String to process, may be null
     * @return the String with each well formed escape replaced by the character it denotes, or null if raw was null
     */
    static public String transformUnicode(String raw) {
        if(raw == null) {
            return null;
        }
        int raw_length = raw.length();
        StringBuffer processed = new StringBuffer(raw_length);
        int index = 0;
        while(index < raw_length) {
            char current = raw.charAt(index);
            int consumed = 0;
            if(current == AND_CHAR) {
                consumed = decodeNumericEntity(raw, index, processed);
            }
            else if(current == ESCAPE_CHAR) {
                consumed = decodeUnicodeEscape(raw, index, processed);
            }
            // Not the start of a well formed escape, so the character stands for itself
            if(consumed == 0) {
                processed.append(current);
                consumed = 1;
            }
            index = index + consumed;
        }
        return processed.toString();
    }

    /** Attempt to decode a decimal character reference (ampersand, hash, one or more digits, semicolon) starting at the given position.
     * @param raw the String being processed
     * @param start the index of the ampersand
     * @param out the buffer the decoded character is appended to on success
     * @return the number of characters of raw that were consumed, or 0 (with out untouched) if no well formed reference starts here
     */
    static private int decodeNumericEntity(String raw, int start, StringBuffer out) {
        int raw_length = raw.length();
        if(start + 1 >= raw_length || raw.charAt(start + 1) != HASH_CHAR) {
            return 0;
        }
        int first_digit = start + 2;
        int position = first_digit;
        int code_point = 0;
        while(position < raw_length) {
            char c = raw.charAt(position);
            if(c < '0' || c > '9') {
                break;
            }
            code_point = (code_point * 10) + (c - '0');
            // Checking on every digit also keeps the running value from overflowing
            if(code_point > MAX_CODE_POINT) {
                return 0;
            }
            position++;
        }
        // Need at least one digit, and the digits must be closed by a semicolon
        if(position == first_digit || position >= raw_length || raw.charAt(position) != SEMICOLON_CHAR) {
            return 0;
        }
        appendCodePoint(out, code_point);
        return (position - start) + 1;
    }

    /** Attempt to decode a Java style escape (backslash, 'u' or 'U', four hexadecimal digits) starting at the given position.
     * @param raw the String being processed
     * @param start the index of the backslash
     * @param out the buffer the decoded character is appended to on success
     * @return 6 if an escape was decoded, or 0 (with out untouched) if no well formed escape starts here
     */
    static private int decodeUnicodeEscape(String raw, int start, StringBuffer out) {
        if(start + 5 >= raw.length()) {
            return 0;
        }
        char marker = raw.charAt(start + 1);
        if(marker != LOWER_U_CHAR && marker != UPPER_U_CHAR) {
            return 0;
        }
        // We read four digits
        int value = 0;
        for(int i = start + 2; i < start + 6; i++) {
            int digit = Character.digit(raw.charAt(i), 16);
            if(digit < 0) {
                return 0;
            }
            value = (value << 4) | digit;
        }
        out.append((char) value);
        return 6;
    }

    /** Append the character with the given number to the buffer, writing values above 0xFFFF as a UTF-16 surrogate pair instead of truncating them to a single char.
     * @param out the buffer to append to
     * @param code_point the character number, between 0 and MAX_CODE_POINT inclusive
     */
    static private void appendCodePoint(StringBuffer out, int code_point) {
        if(code_point < 0x10000) {
            out.append((char) code_point);
        }
        else {
            int offset = code_point - 0x10000;
            out.append((char) (0xD800 + (offset >> 10)));
            out.append((char) (0xDC00 + (offset & 0x3FF)));
        }
    }

    /** Print a raw string and the result of applying the named transform to it on standard error. */
    static private void report(String raw, String transform) {
        System.err.println("Raw: '" + raw + "'");
        System.err.println("Processed: '" + transform(raw, transform) + "'");
    }

    /** Announce a test of the named transform and then report its effect on the raw string. */
    static private void runTest(String transform, String raw) {
        System.err.println("Test " + transform);
        report(raw, transform);
    }

    /** Print one line of the unicode test, showing an escape and what transformUnicode makes of it. */
    static private void runUnicodeTest(String escape) {
        System.err.println(escape + " => " + transformUnicode(escape));
    }

    /** Determine whether the self test is running on Windows by looking at the os.name system property. */
    static private boolean runningOnWindows() {
        return System.getProperty("os.name", "").startsWith("Windows");
    }

    /** Command line entry point. With fewer than two arguments a built in test suite is run, printing raw and processed strings for each transform to standard error. Otherwise the first argument is transformed using the transform named by the second argument and the result printed.
     * @param args either nothing, or the raw string followed by the transform name
     */
    static public void main(String[] args) {
        if(args.length >= 2) {
            System.err.println("Raw: '" + args[0] + "'");
            System.err.println("Transform: " + args[1]);
            String processed = transform(args[0], args[1]);
            System.err.println("Processed: '" + processed + "'");
            return;
        }

        System.err.println("Running Test Suite");

        String dom_sample = "A &amp;lt;\nand a &lt;a href=&quot;here.html&quot;&gt;&lt;font size=&apos;2&apos;&gt;URL&lt;/font&gt;&lt;/a&gt;";
        String html_sample = "A &lt;\nand a <a href=\"here.html\"><font size='2'>URL</font></a>";

        runTest(DOM_TO_GREENSTONE, dom_sample);
        runTest(DOM_TO_TEXT, dom_sample);
        runTest(GREENSTONE_TO_DOM, "A &lt;\\nand a <a href=\\\"here.html\\\"><font size=\\\'2\\\'URL</font></a>");
        runTest(GREENSTONE_TO_TEXT, "These \\[ \\] should be escaped, and so should \\\\ that. These &quot; &apos; \\n are encoded.");
        runTest(TEXT_TO_DOM, html_sample);
        runTest(TEXT_TO_GREENSTONE, "These [ ] should be escaped, and so should \\ that. These \" \' \n are encoded.");

        // The shell transform exercised depends on the platform we are running on
        System.err.println("Test " + "TEXT_TO_SHELL");
        String shell_transform;
        if(runningOnWindows()) {
            System.err.println("[Windows Version]");
            shell_transform = TEXT_TO_SHELL_WINDOWS;
        }
        else {
            System.err.println("[Unix Version]");
            shell_transform = TEXT_TO_SHELL_UNIX;
        }
        report(html_sample, shell_transform);

        System.err.println("***** UNICODE TEST *****");
        runUnicodeTest("\\u0030");
        runUnicodeTest("\\u0041");
        runUnicodeTest("\\u007a");
        runUnicodeTest("\\u00e7");
        runUnicodeTest("&#48;");
        runUnicodeTest("&#65;");
        runUnicodeTest("&#122;");
        runUnicodeTest("&#231;");
    }
}
Note: See TracBrowser for help on using the repository browser.