source: test-collections/trunk/filename-encodings/script/isutf8.pl@ 23325

Last change on this file since 23325 was 23325, checked in by davidb, 13 years ago

Utility scripts that help study how the file system is representing the filenames

File size: 2.8 KB
Line 
1#!/usr/bin/perl -w
2
3use File::Spec;
4
5 sub nice_string {
6 join("",
7 map { $_ > 255 ? # if wide character...
8 sprintf("\\x{%04X}", $_) : # \x{...}
9 chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
10 sprintf("\\x%02X", $_) : # \x..
11 quotemeta(chr($_)) # else quoted or as themselves
12 } unpack("U*", $_[0])); # unpack Unicode characters
13 }
14
15
16sub debug_unicode_string
17{
18 join("",
19 map { $_ > 128 ? # if wide character...
20 sprintf("\\x{%04X}", $_) : # \x{...}
21 chr($_)
22 } unpack("U*", $_[0])); # unpack Unicode characters
23}
24
25
26# Returns true (1) if the given string is utf8 and false (0) if it isn't.
27# Does not modify the string parameter.
28sub check_is_utf8 {
29 my $value=shift;
30
31 if (!defined($value)) {
32 return 0; # not utf8 because it is undefined
33 }
34
35 $value =~ m/^/g; # to set \G
36 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
37 my $highbytes=$1;
38 # make sure this block of high bytes is utf-8
39 $highbytes =~ /^/g; # set pos()
40 while ($highbytes =~
41 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
42 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
43 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
44 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
45 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
46 )*([\x80-\xff])? !xg
47 ) {
48 my $badbyte=$1;
49 if (defined $badbyte) { # not end of string
50 return 0; # non-utf8 found
51 }
52 }
53 }
54
55 return 1;
56}
57
58if (scalar(@ARGV)!=1) {
59 my $prog_name = $0;
60 $prog_name =~ s/^.*(\\|\/)//;
61
62 print STDERR "Usage: $prog_name dir\n";
63 exit(1);
64}
65
66
67my $dir = $ARGV[0];
68
69opendir(DIN,"$dir")
70 || die "Unable to open $dir";
71
72my @files = grep { $_ !~ m/^\./ } readdir(DIN);
73
74close(DIN);
75
76foreach my $f (@files)
77{
78 print "\nFilename: $f ";
79 if(check_is_utf8($f)) {
80 print " - is utf8\n";
81 } else {
82 print " - is not in utf8\n";
83 }
84
85
86 if ($f !~ m/(txt|xml|html)$/) {
87 print "Skipping file content check for $f\n";
88 next;
89 }
90
91 my $os = $^O;
92
93 if ($os =~ m/mswin/i) {
94 $f = "$dir\\$f";
95 }
96 else {
97 $f = "$dir/$f";
98 }
99
100
101 # slurp the file and then print if its contents are utf8.
102 # 1. Read all the contents of the html into a string
103 # open the original file for reading
104 unless(open(FIN, "<$f")) {
105 print STDERR "Unable to open $f...ERROR: $!\n";
106 next;
107 #exit(-1);
108 }
109
110 my $contents;
111 {
112 local $/ = undef; # Read entire file at once
113 $contents = <FIN>; # Now file is read in as one single 'line'
114 }
115 close(FIN); # close the file
116
117 if(check_is_utf8($contents)) {
118 print "\tcontents are utf8\n";
119 } else {
120 print "\tcontents are not utf8\n";
121 }
122}
Note: See TracBrowser for help on using the repository browser.