#!/usr/bin/perl -w

# Reports, for every non-hidden entry of a directory, whether the file name
# is structurally valid UTF-8 and, for names ending in txt/xml/html, whether
# the file contents are as well.
#
# Usage: <script> dir

use strict;
use warnings;

use File::Spec;

# nice_string($string)
#
# Returns a printable rendering of $string:
#   * code points above 255 become \x{HHHH},
#   * control characters become \xHH,
#   * everything else is passed through quotemeta().
sub nice_string {
    my ($string) = @_;

    return join(
        "",
        map {
              $_ > 255                   ? sprintf("\\x{%04X}", $_)
            : chr($_) =~ /[[:cntrl:]]/   ? sprintf("\\x%02X", $_)
            :                              quotemeta(chr($_))
        } unpack("U*", $string)
    );
}

# debug_unicode_string($string)
#
# Returns $string with every non-ASCII code point (128 and above) replaced
# by \x{HHHH}; ASCII characters are copied through unchanged.
sub debug_unicode_string {
    my ($string) = @_;

    return join(
        "",
        map {
            # 127 is the last ASCII code point; the earlier "> 128" test let
            # code point 128 through unescaped.
            $_ > 127 ? sprintf("\\x{%04X}", $_) : chr($_)
        } unpack("U*", $string)
    );
}

# check_is_utf8($value)
#
# Returns 1 if every run of high bytes (0x80-0xFF) in $value consists solely
# of well-formed multi-byte sequences (a lead byte followed by the right
# number of 0x80-0xBF continuation bytes), and 0 otherwise.  An undefined
# value yields 0; a string without any high bytes yields 1.
#
# This is a structural check only: 5- and 6-byte forms are accepted, and
# overlong encodings are not rejected.  The argument is copied, so the
# caller's string (and its pos()) is left untouched.
sub check_is_utf8 {
    my ($value) = @_;

    return 0 unless defined $value;    # undefined is never UTF-8

    # Only runs of high bytes can be malformed, so examine each run alone.
    while ($value =~ m/([\x80-\xff]+)/g) {
        my $high_bytes = $1;

        # The whole run must be a concatenation of complete sequences; a
        # stray continuation byte or a truncated sequence makes it fail.
        return 0
            unless $high_bytes =~ m{
                \A
                (?:
                    [\xc0-\xdf][\x80-\xbf]       # 2 byte utf-8
                  | [\xe0-\xef][\x80-\xbf]{2}    # 3 byte
                  | [\xf0-\xf7][\x80-\xbf]{3}    # 4 byte
                  | [\xf8-\xfb][\x80-\xbf]{4}    # 5 byte
                  | [\xfc-\xfd][\x80-\xbf]{5}    # 6 byte
                )*
                \z
            }x;
    }

    return 1;
}

# main(@args)
#
# Script driver.  Expects exactly one argument, the directory to scan;
# otherwise prints a usage message to STDERR and exits with status 1.
# Dies if the directory cannot be opened.  A file that cannot be opened is
# reported on STDERR and skipped.
sub main {
    my @args = @_;

    if (scalar(@args) != 1) {
        my $prog_name = $0;
        $prog_name =~ s/^.*(\\|\/)//;    # strip any leading directory part
        print STDERR "Usage: $prog_name dir\n";
        exit(1);
    }

    my ($dir) = @args;

    opendir(my $dir_handle, $dir) or die "Unable to open $dir: $!";
    my @files = grep { $_ !~ m/^\./ } readdir($dir_handle);    # skip dot-files
    closedir($dir_handle);

    foreach my $f (@files) {
        print "\nFilename: $f ";
        if (check_is_utf8($f)) {
            print " - is utf8\n";
        }
        else {
            print " - is not in utf8\n";
        }

        # Contents are only inspected for names ending in txt, xml or html
        # (the pattern does not require a leading dot).
        if ($f !~ m/(?:txt|xml|html)$/) {
            print "Skipping file content check for $f\n";
            next;
        }

        my $path = File::Spec->catfile($dir, $f);

        # Three-argument open keeps odd file names from being read as open
        # modes; :raw makes sure the bytes under test arrive unmodified.
        my $file_handle;
        unless (open($file_handle, '<:raw', $path)) {
            print STDERR "Unable to open $path...ERROR: $!\n";
            next;
        }

        # Slurp the whole file as a single "line".
        my $contents = do { local $/ = undef; <$file_handle> };
        close($file_handle);

        if (check_is_utf8($contents)) {
            print "\tcontents are utf8\n";
        }
        else {
            print "\tcontents are not utf8\n";
        }
    }

    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. via require) without triggering a directory scan.
main(@ARGV) unless caller;