Context Navigation

source: test-collections/trunk/filename-encodings/bin/script/isutf8.pl@ 23830

Last change on this file since 23830 was 23830, checked in by ak19, 13 years ago
Fixed a minor off by one error: wide characters printed as unicode if greater than 127 instead of 128.
File size: 2.8 KB

Line
1	#!/usr/bin/perl -w
2
3	use File::Spec;
4
5	sub nice_string {
6	join("",
7	map { $_ > 255 ? # if wide character...
8	sprintf("\\x{%04X}", $_) : # \x{...}
9	chr($_) =~ /[[:cntrl:]]/ ? # else if control character ...
10	sprintf("\\x%02X", $_) : # \x..
11	quotemeta(chr($_)) # else quoted or as themselves
12	} unpack("U*", $_[0])); # unpack Unicode characters
13	}
14
15
16	sub debug_unicode_string
17	{
18	join("",
19	map { $_ > 127 ? # if wide character...
20	sprintf("\\x{%04X}", $_) : # \x{...}
21	chr($_)
22	} unpack("U*", $_[0])); # unpack Unicode characters
23	}
24
25
26	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
27	# Does not modify the string parameter.
28	sub check_is_utf8 {
29	my $value=shift;
30
31	if (!defined($value)) {
32	return 0; # not utf8 because it is undefined
33	}
34
35	$value =~ m/^/g; # to set \G
36	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
37	my $highbytes=$1;
38	# make sure this block of high bytes is utf-8
39	$highbytes =~ /^/g; # set pos()
40	while ($highbytes =~
41	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
42	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
43	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
44	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
45	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
46	)*([\x80-\xff])? !xg
47	) {
48	my $badbyte=$1;
49	if (defined $badbyte) { # not end of string
50	return 0; # non-utf8 found
51	}
52	}
53	}
54
55	return 1;
56	}
57
58	if (scalar(@ARGV)!=1) {
59	my $prog_name = $0;
60	$prog_name =~ s/^.*(\\\|\/)//;
61
62	print STDERR "Usage: $prog_name dir\n";
63	exit(1);
64	}
65
66
67	my $dir = $ARGV[0];
68
69	opendir(DIN,"$dir")
70	\|\| die "Unable to open $dir";
71
72	my @files = grep { $_ !~ m/^\./ } readdir(DIN);
73
74	close(DIN);
75
76	foreach my $f (@files)
77	{
78	print "\nFilename: $f ";
79	if(check_is_utf8($f)) {
80	print " - is utf8\n";
81	} else {
82	print " - is not in utf8\n";
83	}
84
85
86	if ($f !~ m/(txt\|xml\|html)$/) {
87	print "Skipping file content check for $f\n";
88	next;
89	}
90
91	my $os = $^O;
92
93	if ($os =~ m/mswin/i) {
94	$f = "$dir\\$f";
95	}
96	else {
97	$f = "$dir/$f";
98	}
99
100
101	# slurp the file and then print if its contents are utf8.
102	# 1. Read all the contents of the html into a string
103	# open the original file for reading
104	unless(open(FIN, "<$f")) {
105	print STDERR "Unable to open $f...ERROR: $!\n";
106	next;
107	#exit(-1);
108	}
109
110	my $contents;
111	{
112	local $/ = undef; # Read entire file at once
113	$contents = <FIN>; # Now file is read in as one single 'line'
114	}
115	close(FIN); # close the file
116
117	if(check_is_utf8($contents)) {
118	print "\tcontents are utf8\n";
119	} else {
120	print "\tcontents are not utf8\n";
121	}
122	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: