1 | #!/usr/bin/perl -w
|
---|
2 |
|
---|
3 | use File::Spec;
|
---|
4 |
|
---|
5 | sub nice_string {
|
---|
6 | join("",
|
---|
7 | map { $_ > 255 ? # if wide character...
|
---|
8 | sprintf("\\x{%04X}", $_) : # \x{...}
|
---|
9 | chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
|
---|
10 | sprintf("\\x%02X", $_) : # \x..
|
---|
11 | quotemeta(chr($_)) # else quoted or as themselves
|
---|
12 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
13 | }
|
---|
14 |
|
---|
15 |
|
---|
16 | sub debug_unicode_string
|
---|
17 | {
|
---|
18 | join("",
|
---|
19 | map { $_ > 127 ? # if wide character...
|
---|
20 | sprintf("\\x{%04X}", $_) : # \x{...}
|
---|
21 | chr($_)
|
---|
22 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
23 | }
|
---|
24 |
|
---|
25 |
|
---|
26 | # Returns true (1) if the given string is utf8 and false (0) if it isn't.
|
---|
27 | # Does not modify the string parameter.
|
---|
28 | sub check_is_utf8 {
|
---|
29 | my $value=shift;
|
---|
30 |
|
---|
31 | if (!defined($value)) {
|
---|
32 | return 0; # not utf8 because it is undefined
|
---|
33 | }
|
---|
34 |
|
---|
35 | $value =~ m/^/g; # to set \G
|
---|
36 | while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
|
---|
37 | my $highbytes=$1;
|
---|
38 | # make sure this block of high bytes is utf-8
|
---|
39 | $highbytes =~ /^/g; # set pos()
|
---|
40 | while ($highbytes =~
|
---|
41 | m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
|
---|
42 | [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
|
---|
43 | [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
|
---|
44 | [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
|
---|
45 | [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
|
---|
46 | )*([\x80-\xff])? !xg
|
---|
47 | ) {
|
---|
48 | my $badbyte=$1;
|
---|
49 | if (defined $badbyte) { # not end of string
|
---|
50 | return 0; # non-utf8 found
|
---|
51 | }
|
---|
52 | }
|
---|
53 | }
|
---|
54 |
|
---|
55 | return 1;
|
---|
56 | }
|
---|
57 |
|
---|
58 | if (scalar(@ARGV)!=1) {
|
---|
59 | my $prog_name = $0;
|
---|
60 | $prog_name =~ s/^.*(\\|\/)//;
|
---|
61 |
|
---|
62 | print STDERR "Usage: $prog_name dir\n";
|
---|
63 | exit(1);
|
---|
64 | }
|
---|
65 |
|
---|
66 |
|
---|
67 | my $dir = $ARGV[0];
|
---|
68 |
|
---|
69 | opendir(DIN,"$dir")
|
---|
70 | || die "Unable to open $dir";
|
---|
71 |
|
---|
72 | my @files = grep { $_ !~ m/^\./ } readdir(DIN);
|
---|
73 |
|
---|
74 | close(DIN);
|
---|
75 |
|
---|
76 | foreach my $f (@files)
|
---|
77 | {
|
---|
78 | print "\nFilename: $f ";
|
---|
79 | if(check_is_utf8($f)) {
|
---|
80 | print " - is utf8\n";
|
---|
81 | } else {
|
---|
82 | print " - is not in utf8\n";
|
---|
83 | }
|
---|
84 |
|
---|
85 |
|
---|
86 | if ($f !~ m/(txt|xml|html|item)$/) {
|
---|
87 | print "Skipping file content check for $f\n";
|
---|
88 | next;
|
---|
89 | }
|
---|
90 |
|
---|
91 | my $os = $^O;
|
---|
92 |
|
---|
93 | if ($os =~ m/mswin/i) {
|
---|
94 | $f = "$dir\\$f";
|
---|
95 | }
|
---|
96 | else {
|
---|
97 | $f = "$dir/$f";
|
---|
98 | }
|
---|
99 |
|
---|
100 |
|
---|
101 | # slurp the file and then print if its contents are utf8.
|
---|
102 | # 1. Read all the contents of the html into a string
|
---|
103 | # open the original file for reading
|
---|
104 | unless(open(FIN, "<$f")) {
|
---|
105 | print STDERR "Unable to open $f...ERROR: $!\n";
|
---|
106 | next;
|
---|
107 | #exit(-1);
|
---|
108 | }
|
---|
109 |
|
---|
110 | my $contents;
|
---|
111 | {
|
---|
112 | local $/ = undef; # Read entire file at once
|
---|
113 | $contents = <FIN>; # Now file is read in as one single 'line'
|
---|
114 | }
|
---|
115 | close(FIN); # close the file
|
---|
116 |
|
---|
117 | if(check_is_utf8($contents)) {
|
---|
118 | print "\tcontents are utf8\n";
|
---|
119 | } else {
|
---|
120 | print "\tcontents are not utf8\n";
|
---|
121 | }
|
---|
122 | }
|
---|