[23325] | 1 | #!/usr/bin/perl -w
|
---|
| 2 |
|
---|
| 3 | use File::Spec;
|
---|
| 4 |
|
---|
| 5 | sub nice_string {
|
---|
| 6 | join("",
|
---|
| 7 | map { $_ > 255 ? # if wide character...
|
---|
| 8 | sprintf("\\x{%04X}", $_) : # \x{...}
|
---|
| 9 | chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
|
---|
| 10 | sprintf("\\x%02X", $_) : # \x..
|
---|
| 11 | quotemeta(chr($_)) # else quoted or as themselves
|
---|
| 12 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
| 13 | }
|
---|
| 14 |
|
---|
| 15 |
|
---|
| 16 | sub debug_unicode_string
|
---|
| 17 | {
|
---|
| 18 | join("",
|
---|
| 19 | map { $_ > 128 ? # if wide character...
|
---|
| 20 | sprintf("\\x{%04X}", $_) : # \x{...}
|
---|
| 21 | chr($_)
|
---|
| 22 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
| 23 | }
|
---|
| 24 |
|
---|
| 25 |
|
---|
| 26 | # Returns true (1) if the given string is utf8 and false (0) if it isn't.
|
---|
| 27 | # Does not modify the string parameter.
|
---|
| 28 | sub check_is_utf8 {
|
---|
| 29 | my $value=shift;
|
---|
| 30 |
|
---|
| 31 | if (!defined($value)) {
|
---|
| 32 | return 0; # not utf8 because it is undefined
|
---|
| 33 | }
|
---|
| 34 |
|
---|
| 35 | $value =~ m/^/g; # to set \G
|
---|
| 36 | while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
|
---|
| 37 | my $highbytes=$1;
|
---|
| 38 | # make sure this block of high bytes is utf-8
|
---|
| 39 | $highbytes =~ /^/g; # set pos()
|
---|
| 40 | while ($highbytes =~
|
---|
| 41 | m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
|
---|
| 42 | [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
|
---|
| 43 | [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
|
---|
| 44 | [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
|
---|
| 45 | [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
|
---|
| 46 | )*([\x80-\xff])? !xg
|
---|
| 47 | ) {
|
---|
| 48 | my $badbyte=$1;
|
---|
| 49 | if (defined $badbyte) { # not end of string
|
---|
| 50 | return 0; # non-utf8 found
|
---|
| 51 | }
|
---|
| 52 | }
|
---|
| 53 | }
|
---|
| 54 |
|
---|
| 55 | return 1;
|
---|
| 56 | }
|
---|
| 57 |
|
---|
| 58 | if (scalar(@ARGV)!=1) {
|
---|
| 59 | my $prog_name = $0;
|
---|
| 60 | $prog_name =~ s/^.*(\\|\/)//;
|
---|
| 61 |
|
---|
| 62 | print STDERR "Usage: $prog_name dir\n";
|
---|
| 63 | exit(1);
|
---|
| 64 | }
|
---|
| 65 |
|
---|
| 66 |
|
---|
| 67 | my $dir = $ARGV[0];
|
---|
| 68 |
|
---|
| 69 | opendir(DIN,"$dir")
|
---|
| 70 | || die "Unable to open $dir";
|
---|
| 71 |
|
---|
| 72 | my @files = grep { $_ !~ m/^\./ } readdir(DIN);
|
---|
| 73 |
|
---|
| 74 | close(DIN);
|
---|
| 75 |
|
---|
| 76 | foreach my $f (@files)
|
---|
| 77 | {
|
---|
| 78 | print "\nFilename: $f ";
|
---|
| 79 | if(check_is_utf8($f)) {
|
---|
| 80 | print " - is utf8\n";
|
---|
| 81 | } else {
|
---|
| 82 | print " - is not in utf8\n";
|
---|
| 83 | }
|
---|
| 84 |
|
---|
| 85 |
|
---|
| 86 | if ($f !~ m/(txt|xml|html)$/) {
|
---|
| 87 | print "Skipping file content check for $f\n";
|
---|
| 88 | next;
|
---|
| 89 | }
|
---|
| 90 |
|
---|
| 91 | my $os = $^O;
|
---|
| 92 |
|
---|
| 93 | if ($os =~ m/mswin/i) {
|
---|
| 94 | $f = "$dir\\$f";
|
---|
| 95 | }
|
---|
| 96 | else {
|
---|
| 97 | $f = "$dir/$f";
|
---|
| 98 | }
|
---|
| 99 |
|
---|
| 100 |
|
---|
| 101 | # slurp the file and then print if its contents are utf8.
|
---|
| 102 | # 1. Read all the contents of the html into a string
|
---|
| 103 | # open the original file for reading
|
---|
| 104 | unless(open(FIN, "<$f")) {
|
---|
| 105 | print STDERR "Unable to open $f...ERROR: $!\n";
|
---|
| 106 | next;
|
---|
| 107 | #exit(-1);
|
---|
| 108 | }
|
---|
| 109 |
|
---|
| 110 | my $contents;
|
---|
| 111 | {
|
---|
| 112 | local $/ = undef; # Read entire file at once
|
---|
| 113 | $contents = <FIN>; # Now file is read in as one single 'line'
|
---|
| 114 | }
|
---|
| 115 | close(FIN); # close the file
|
---|
| 116 |
|
---|
| 117 | if(check_is_utf8($contents)) {
|
---|
| 118 | print "\tcontents are utf8\n";
|
---|
| 119 | } else {
|
---|
| 120 | print "\tcontents are not utf8\n";
|
---|
| 121 | }
|
---|
| 122 | }
|
---|