1 | use strict;
|
---|
2 |
|
---|
3 | use Encode qw(decode encode);
|
---|
4 |
|
---|
5 | binmode STDERR, ":encoding(utf8)";
|
---|
6 | binmode STDOUT, ":encoding(utf8)";
|
---|
7 |
|
---|
8 | my $DEBUG = 0;
|
---|
9 |
|
---|
10 | sub debug_unicode_string
|
---|
11 | {
|
---|
12 | join("",
|
---|
13 | map { $_ > 127 ? # if wide character...
|
---|
14 | sprintf("\\x{%04X}", $_) : # \x{...}
|
---|
15 | chr($_)
|
---|
16 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
17 | }
|
---|
18 |
|
---|
19 |
|
---|
20 | sub debug_unicode_string_bytes
|
---|
21 | {
|
---|
22 | join("",
|
---|
23 | map { $_ > 127 ? # if wide character...
|
---|
24 | sprintf("\\u{%02X}", $_) : # \x{...}
|
---|
25 | chr($_)
|
---|
26 | } unpack("U*", $_[0])); # unpack Unicode characters
|
---|
27 | }
|
---|
28 |
|
---|
29 |
|
---|
30 | # Returns true (1) if the given string is utf8 and false (0) if it isn't.
|
---|
31 | # Does not modify the string parameter.
|
---|
32 | sub check_is_utf8 {
|
---|
33 | my $value=shift;
|
---|
34 |
|
---|
35 | if (!defined($value)) {
|
---|
36 | return 0; # not utf8 because it is undefined
|
---|
37 | }
|
---|
38 |
|
---|
39 | $value =~ m/^/g; # to set \G
|
---|
40 | while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
|
---|
41 | my $highbytes=$1;
|
---|
42 | # make sure this block of high bytes is utf-8
|
---|
43 | $highbytes =~ /^/g; # set pos()
|
---|
44 | while ($highbytes =~
|
---|
45 | m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
|
---|
46 | [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
|
---|
47 | [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
|
---|
48 | [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
|
---|
49 | [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
|
---|
50 | )*([\x80-\xff])? !xg
|
---|
51 | ) {
|
---|
52 | my $badbyte=$1;
|
---|
53 | if (defined $badbyte) { # not end of string
|
---|
54 | return 0; # non-utf8 found
|
---|
55 | }
|
---|
56 | }
|
---|
57 | }
|
---|
58 |
|
---|
59 | return 1;
|
---|
60 | }
|
---|
61 |
|
---|
62 |
|
---|
63 | sub unescapeEntities
|
---|
64 | {
|
---|
65 | my ($line) = @_;
|
---|
66 |
|
---|
67 | # unescape reserved entities
|
---|
68 | $line =~ s/'/'/g;
|
---|
69 | $line =~ s/"/"/g;
|
---|
70 | $line =~ s/</</g;
|
---|
71 | $line =~ s/>/>/g;
|
---|
72 | $line =~ s/&/&/g;
|
---|
73 |
|
---|
74 | return $line;
|
---|
75 | }
|
---|
76 |
|
---|
77 |
|
---|
78 | sub getTagPairs
|
---|
79 | {
|
---|
80 | my ($line,$opt_filter) = @_;
|
---|
81 |
|
---|
82 | my @filtered_tag_pairs = ();
|
---|
83 |
|
---|
84 | my @tag_pairs = split(/(<[^>]*>)/,$line);
|
---|
85 | shift @tag_pairs; # given the split RE, there's always an empty-string first match
|
---|
86 |
|
---|
87 | my $tag_pairs_len = scalar(@tag_pairs);
|
---|
88 |
|
---|
89 | for (my $i=0; $i<$tag_pairs_len; $i+=2) {
|
---|
90 | my $tag = $tag_pairs[$i];
|
---|
91 | my $inner_text = $tag_pairs[$i+1];
|
---|
92 | $inner_text = "" if $inner_text eq "\n";
|
---|
93 |
|
---|
94 | if (!defined $opt_filter) {
|
---|
95 | push(@filtered_tag_pairs,$tag,$inner_text);
|
---|
96 | }
|
---|
97 | elsif ($tag =~ m/^<$opt_filter/i) {
|
---|
98 | push(@filtered_tag_pairs,$tag,$inner_text);
|
---|
99 | }
|
---|
100 | }
|
---|
101 |
|
---|
102 |
|
---|
103 | return \@filtered_tag_pairs;
|
---|
104 | }
|
---|
105 |
|
---|
106 | sub getInnerTextFromTags
|
---|
107 | {
|
---|
108 | my ($line,$opt_filter) = @_;
|
---|
109 |
|
---|
110 | my @inner_text_vals = ();
|
---|
111 |
|
---|
112 | my $filtered_tag_pairs = getTagPairs($line,$opt_filter);
|
---|
113 |
|
---|
114 | my $filtered_tag_pairs_len = scalar(@$filtered_tag_pairs);
|
---|
115 |
|
---|
116 | for (my $i=0; $i<$filtered_tag_pairs_len; $i+=2) {
|
---|
117 | my $tag = $filtered_tag_pairs->[$i];
|
---|
118 | my $inner_text = $filtered_tag_pairs->[$i+1];
|
---|
119 |
|
---|
120 | if ($inner_text ne "") {
|
---|
121 | my $inner_text_unescaped = unescapeEntities($inner_text);
|
---|
122 |
|
---|
123 | push(@inner_text_vals,$inner_text_unescaped);
|
---|
124 | }
|
---|
125 | }
|
---|
126 |
|
---|
127 | my $inner_text_vals_len = scalar(@inner_text_vals);
|
---|
128 |
|
---|
129 | if ($DEBUG) {
|
---|
130 | if ($inner_text_vals_len > 0) {
|
---|
131 | print "Filtered tags, innner text: [\n";
|
---|
132 | map {print " $_\n"} @inner_text_vals;
|
---|
133 | print "]\n";
|
---|
134 | }
|
---|
135 | }
|
---|
136 |
|
---|
137 | return \@inner_text_vals;
|
---|
138 | }
|
---|
139 |
|
---|
140 | sub processInnerTextVals
|
---|
141 | {
|
---|
142 | my ($inner_text_vals,$line_num) = @_;
|
---|
143 |
|
---|
144 | for my $line (@{$inner_text_vals}) {
|
---|
145 |
|
---|
146 | my $is_utf8 = check_is_utf8($line);
|
---|
147 | if ($is_utf8) {
|
---|
148 | # if ($line =~ m/[^[:ascii:]]/) {
|
---|
149 |
|
---|
150 | # from_to($line, "iso-8859-1", "UTF-8"); #1 -- does not turn on unicode-aware flag
|
---|
151 | # my $line_utf8 = decode("iso-8859-1", $line); #2 -- does turn on unicode-aware flag
|
---|
152 |
|
---|
153 | my $line_utf8 = decode("utf8", $line); #2 -- does turn on unicode-aware flag
|
---|
154 |
|
---|
155 | if ($line =~ m/[^\x{00}-\x{7F}]/) {
|
---|
156 | print "$line_num:\tUses Non-ASCII chars:\t$line_utf8\n";
|
---|
157 | print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
|
---|
158 | print " Unicode codepoints > 127:\t", debug_unicode_string($line_utf8), "\n";
|
---|
159 | print "-" x 20, "\n";
|
---|
160 | }
|
---|
161 | }
|
---|
162 | else {
|
---|
163 | # my $line_utf8 = decode("utf8", $line); # decode() turns on unicode-aware flag
|
---|
164 |
|
---|
165 | print "$line_num:\tNOT VALID UTF8\n";
|
---|
166 | print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
|
---|
167 | print " As Latin1 mapping:\t$line", "\n";
|
---|
168 | print "-" x 20, "\n";
|
---|
169 | }
|
---|
170 | }
|
---|
171 |
|
---|
172 | }
|
---|
173 |
|
---|
174 | sub processLine
|
---|
175 | {
|
---|
176 | my ($line_num,$line) = @_;
|
---|
177 |
|
---|
178 | my $inner_text_vals = getInnerTextFromTags($line,"subfield|marc:subfield");
|
---|
179 |
|
---|
180 | processInnerTextVals($inner_text_vals,$line_num);
|
---|
181 |
|
---|
182 | # # strip out tags
|
---|
183 | # $line =~ s/<[^>]*>/ /g;
|
---|
184 |
|
---|
185 | # # unescape reserved entities
|
---|
186 | # $line = unescapeEntities($line);
|
---|
187 |
|
---|
188 | }
|
---|
189 |
|
---|
190 | sub processFile
|
---|
191 | {
|
---|
192 | my ($filename) = @_;
|
---|
193 |
|
---|
194 |
|
---|
195 | my $line_num = 0;
|
---|
196 |
|
---|
197 | #if (open FILE, '<:encoding(utf8)', $filename) {
|
---|
198 | # Explicitly forcing to Latin1 causes the text file to be read in a 1 byte per char
|
---|
199 | if (open FILE, "< :encoding(Latin1)", $filename) {
|
---|
200 | #if (open FILE, $filename) {
|
---|
201 | while(<FILE>) {
|
---|
202 | my $line .= $_;
|
---|
203 | $line_num++;
|
---|
204 |
|
---|
205 | processLine($line_num,$line)
|
---|
206 | }
|
---|
207 | close FILE;
|
---|
208 | }
|
---|
209 | else {
|
---|
210 | print STDERR "Failed to open '$filename'\n";
|
---|
211 | print $!;
|
---|
212 | }
|
---|
213 | }
|
---|
214 |
|
---|
215 | my $filename = $ARGV[0];
|
---|
216 |
|
---|
217 |
|
---|
218 | processFile($filename);
|
---|
219 |
|
---|
220 |
|
---|
221 | #perl -ne 'print if /[^[:ascii:]]/' $*
|
---|
222 |
|
---|
223 | #<subfield code="a">Ren
|
---|
224 |
|
---|
225 | # <subfield code="a">Ren
|
---|