source: test-collections/trunk/filename-encodings/bin/script/isutf8.pl@ 23830

Last change on this file since 23830 was 23830, checked in by ak19, 13 years ago

Fixed a minor off by one error: wide characters printed as unicode if greater than 127 instead of 128.

File size: 2.8 KB
Line 
1#!/usr/bin/perl -w
2
3use File::Spec;
4
5 sub nice_string {
6 join("",
7 map { $_ > 255 ? # if wide character...
8 sprintf("\\x{%04X}", $_) : # \x{...}
9 chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
10 sprintf("\\x%02X", $_) : # \x..
11 quotemeta(chr($_)) # else quoted or as themselves
12 } unpack("U*", $_[0])); # unpack Unicode characters
13 }
14
15
16sub debug_unicode_string
17{
18 join("",
19 map { $_ > 127 ? # if wide character...
20 sprintf("\\x{%04X}", $_) : # \x{...}
21 chr($_)
22 } unpack("U*", $_[0])); # unpack Unicode characters
23}
24
25
26# Returns true (1) if the given string is utf8 and false (0) if it isn't.
27# Does not modify the string parameter.
28sub check_is_utf8 {
29 my $value=shift;
30
31 if (!defined($value)) {
32 return 0; # not utf8 because it is undefined
33 }
34
35 $value =~ m/^/g; # to set \G
36 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
37 my $highbytes=$1;
38 # make sure this block of high bytes is utf-8
39 $highbytes =~ /^/g; # set pos()
40 while ($highbytes =~
41 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
42 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
43 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
44 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
45 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
46 )*([\x80-\xff])? !xg
47 ) {
48 my $badbyte=$1;
49 if (defined $badbyte) { # not end of string
50 return 0; # non-utf8 found
51 }
52 }
53 }
54
55 return 1;
56}
57
58if (scalar(@ARGV)!=1) {
59 my $prog_name = $0;
60 $prog_name =~ s/^.*(\\|\/)//;
61
62 print STDERR "Usage: $prog_name dir\n";
63 exit(1);
64}
65
66
67my $dir = $ARGV[0];
68
69opendir(DIN,"$dir")
70 || die "Unable to open $dir";
71
72my @files = grep { $_ !~ m/^\./ } readdir(DIN);
73
74close(DIN);
75
76foreach my $f (@files)
77{
78 print "\nFilename: $f ";
79 if(check_is_utf8($f)) {
80 print " - is utf8\n";
81 } else {
82 print " - is not in utf8\n";
83 }
84
85
86 if ($f !~ m/(txt|xml|html)$/) {
87 print "Skipping file content check for $f\n";
88 next;
89 }
90
91 my $os = $^O;
92
93 if ($os =~ m/mswin/i) {
94 $f = "$dir\\$f";
95 }
96 else {
97 $f = "$dir/$f";
98 }
99
100
101 # slurp the file and then print if its contents are utf8.
102 # 1. Read all the contents of the html into a string
103 # open the original file for reading
104 unless(open(FIN, "<$f")) {
105 print STDERR "Unable to open $f...ERROR: $!\n";
106 next;
107 #exit(-1);
108 }
109
110 my $contents;
111 {
112 local $/ = undef; # Read entire file at once
113 $contents = <FIN>; # Now file is read in as one single 'line'
114 }
115 close(FIN); # close the file
116
117 if(check_is_utf8($contents)) {
118 print "\tcontents are utf8\n";
119 } else {
120 print "\tcontents are not utf8\n";
121 }
122}
Note: See TracBrowser for help on using the repository browser.