source: test-collections/trunk/filename-encodings/bin/script/DirList2.java@ 23830

Last change on this file since 23830 was 23334, checked in by ak19, 13 years ago

Added further comments; usage is now printed when fewer than one argument is given.

  • Property svn:executable set to *
File size: 4.1 KB
Line 
1import java.io.*;
2import java.lang.*;
3import java.net.*;
4
/**
 * Lists the entries of a directory and prints every regular file as a
 * percent-encoded file URI, then checks whether the printed URI can be turned
 * back into a {@link File} that exists.
 *
 * NOTE(review): the repository path for this source is DirList2.java, while
 * the public class is named DirList; javac expects a public class to live in
 * a file of the same name -- confirm how this file is meant to be compiled.
 */
public class DirList
{
    /** Upper-case hexadecimal digits used when writing %XX escapes. */
    private static final String HEX_DIGITS = "0123456789ABCDEF";

    /**
     * Appends one byte value to {@code out} as a three-character %XX escape.
     *
     * @param out     buffer that receives the escape
     * @param byteVal value to write; only the low 8 bits are used
     */
    private static void append_percent_encoded(StringBuilder out, int byteVal)
    {
        out.append('%');
        out.append(HEX_DIGITS.charAt((byteVal >> 4) & 0xF));
        out.append(HEX_DIGITS.charAt(byteVal & 0xF));
    }

    /**
     * Percent-encodes every byte above 127 in the byte form of a file name.
     * Bytes in the range 0-127 are copied through unchanged (including space
     * and '%').
     *
     * The bytes are obtained with {@code String.getBytes()}, i.e. the platform
     * default charset, so the result for non-ASCII input depends on the JVM's
     * default encoding.
     *
     * @param fileName the name to encode; {@code null} yields an empty string
     * @return the encoded name
     */
    public static String raw_filename_to_url_encoded(String fileName)
    {
        if (fileName == null) {
            return "";
        }

        byte[] bytes = fileName.getBytes();
        StringBuilder urlEncoded = new StringBuilder(bytes.length);

        for (int i = 0; i < bytes.length; i++) {
            // mask the signed byte (-128 to 127) to make it unsigned (0 to 255)
            int byteVal = bytes[i] & 0xFF;

            if (byteVal > 127) {
                append_percent_encoded(urlEncoded, byteVal);
            } else {
                urlEncoded.append((char) byteVal);
            }
        }

        return urlEncoded.toString();
    }

    // For unicode codepoints see:
    //   http://unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT for ISO8859-1 (Latin-1)
    //     where 0xE2 maps to codepoint 0x00E2 and is defined as "Latin small letter a with circumflex"
    //   http://unicode.org/Public/MAPPINGS/ISO8859/8859-7.TXT for ISO8859-7 (Greek)
    //     where 0xE2 maps to codepoint 0x03B2 and is defined as "Greek small letter beta"

    /**
     * Converts a string whose characters each hold one byte of a UTF-8
     * sequence into a percent-encoded string of the decoded characters.
     *
     * The input is turned back into bytes via ISO-8859-1 and those bytes are
     * decoded as UTF-8. Each resulting character is then written as follows:
     * <ul>
     *   <li>0-127: copied through unchanged;</li>
     *   <li>128-255: written as a single %XX escape holding the character's
     *       code point, which is also its ISO-8859-1 byte value (so the
     *       two-character input C3 A2 is written as %E2);</li>
     *   <li>above 255: the character has no single-byte value, so its UTF-8
     *       bytes are written as %XX escapes. Formatting the code point itself
     *       would give three or more hex digits (for example %3B2), which is
     *       not a valid percent-escape.</li>
     * </ul>
     *
     * @param raw_bytes_filename string of 8-bit values holding UTF-8 bytes;
     *                           {@code null} yields an empty string
     * @return the percent-encoded name
     */
    public static String iso_8859_1_filename_to_url_encoded(String raw_bytes_filename)
    {
        if (raw_bytes_filename == null) {
            return "";
        }

        StringBuilder urlEncoded = new StringBuilder(raw_bytes_filename.length());

        try {
            // Example:
            //   File system char:          <lower-case beta in ISO-8859-7>   = %E2
            //   Equivalent Latin-1 char:   <lower-case a with circumflex>    = %E2
            //   Mapped to UTF-8:           <lower-case a with circumflex>    = <C3><A2>
            // The task is to take the string that contains <C3><A2> and
            // "see" it as <E2>.
            byte[] raw_bytes = raw_bytes_filename.getBytes("ISO-8859-1");
            String unicode_filename = new String(raw_bytes, "UTF-8");

            // Walk by code point so a surrogate pair is encoded as one character.
            int i = 0;
            while (i < unicode_filename.length()) {
                int codePoint = unicode_filename.codePointAt(i);
                int charCount = Character.charCount(codePoint);

                if (codePoint <= 127) {
                    urlEncoded.append((char) codePoint);
                } else if (codePoint <= 0xFF) {
                    append_percent_encoded(urlEncoded, codePoint);
                } else {
                    byte[] utf8 = unicode_filename.substring(i, i + charCount).getBytes("UTF-8");
                    for (int j = 0; j < utf8.length; j++) {
                        append_percent_encoded(urlEncoded, utf8[j] & 0xFF);
                    }
                }

                i += charCount;
            }
        }
        catch (UnsupportedEncodingException e) {
            // Whatever was encoded before the failure is still returned.
            e.printStackTrace();
        }

        return urlEncoded.toString();
    }

    /**
     * Prints the percent-encoded URI of one regular file and reports whether a
     * {@link File} built from that URI exists.
     *
     * The trick:
     *   1. toASCIIString() will %xx encode values > 127
     *   2. Decode the result to "ISO-8859-1"
     *   3. URL encode the bytes to string
     *
     * Step 2 forces the string to be 8-bit values. It doesn't matter if the
     * starting raw filename was *not* in the ISO-8859-1 encoding, the effect
     * is to ensure we have an 8-bit byte string that (numerically) captures
     * the right value. These numerical values are then used to determine how
     * to URL encode it.
     *
     * @param file the file to report on
     */
    private static void print_file_entry(File file)
    {
        try {
            String filename_ascii = file.toURI().toASCIIString();

            // URLDecoder applies form-decoding rules and would turn a literal
            // '+' into a space, so escape it before decoding.
            String filename_raw_bytes =
                URLDecoder.decode(filename_ascii.replace("+", "%2B"), "ISO-8859-1");

            String filename_url_encoded =
                iso_8859_1_filename_to_url_encoded(filename_raw_bytes).replace(" ", "%20");

            System.out.println("File " + filename_url_encoded);

            File test_file = new File(new URI(filename_url_encoded));
            if (test_file.exists()) {
                System.out.println(" ... and I can see it!!!");
            }
        }
        catch (Exception e) {
            // Report the problem and carry on, so that one troublesome name
            // does not stop the rest of the listing.
            e.printStackTrace();
        }
    }

    /**
     * Entry point. Expects the directory to list as the first argument.
     *
     * Exits with status -1 when no argument is given or when the argument
     * cannot be listed as a directory.
     *
     * @param args command-line arguments; args[0] is the directory to list
     */
    public static void main(String[] args)
    {
        if (args.length < 1) {
            System.out.println("Usage: DirList <directory>");
            System.exit(-1);
        }

        File folder = new File(args[0]);
        File[] listOfFiles = folder.listFiles();

        // listFiles() returns null when the path is not a directory or an
        // I/O error occurs.
        if (listOfFiles == null) {
            System.err.println("Cannot list '" + folder
                               + "': not a directory, or it could not be read");
            System.exit(-1);
        }

        for (int i = 0; i < listOfFiles.length; i++) {
            File entry = listOfFiles[i];

            if (entry.isFile()) {
                print_file_entry(entry);
            } else if (entry.isDirectory()) {
                System.out.println("Directory " + entry.getName());
            } else {
                System.out.println("*** Not file or dir. Can't see file: " + entry);
            }
        }
    }

}
Note: See TracBrowser for help on using the repository browser.