source: main/trunk/model-sites-dev/von-sparql/collect/nz-natlib-cat/pre-import/HUNT-FOR-NON-ASCII.pl@ 36291

Last change on this file since 36291 was 36291, checked in by davidb, 22 months ago

Some basic tag processing of MARCXML records, printing out lines with non-ASCII chars

File size: 5.2 KB
Line 
1use strict;
2
3use Encode qw(decode encode);
4
5binmode STDERR, ":encoding(utf8)";
6binmode STDOUT, ":encoding(utf8)";
7
8my $DEBUG = 0;
9
10sub debug_unicode_string
11{
12 join("",
13 map { $_ > 127 ? # if wide character...
14 sprintf("\\x{%04X}", $_) : # \x{...}
15 chr($_)
16 } unpack("U*", $_[0])); # unpack Unicode characters
17}
18
19
20sub debug_unicode_string_bytes
21{
22 join("",
23 map { $_ > 127 ? # if wide character...
24 sprintf("\\u{%02X}", $_) : # \x{...}
25 chr($_)
26 } unpack("U*", $_[0])); # unpack Unicode characters
27}
28
29
30# Returns true (1) if the given string is utf8 and false (0) if it isn't.
31# Does not modify the string parameter.
32sub check_is_utf8 {
33 my $value=shift;
34
35 if (!defined($value)) {
36 return 0; # not utf8 because it is undefined
37 }
38
39 $value =~ m/^/g; # to set \G
40 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
41 my $highbytes=$1;
42 # make sure this block of high bytes is utf-8
43 $highbytes =~ /^/g; # set pos()
44 while ($highbytes =~
45 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
46 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
47 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
48 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
49 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
50 )*([\x80-\xff])? !xg
51 ) {
52 my $badbyte=$1;
53 if (defined $badbyte) { # not end of string
54 return 0; # non-utf8 found
55 }
56 }
57 }
58
59 return 1;
60}
61
62
63sub unescapeEntities
64{
65 my ($line) = @_;
66
67 # unescape reserved entities
68 $line =~ s/'/'/g;
69 $line =~ s/"/"/g;
70 $line =~ s/&lt;/</g;
71 $line =~ s/&gt;/>/g;
72 $line =~ s/&amp;/&/g;
73
74 return $line;
75}
76
77
78sub getTagPairs
79{
80 my ($line,$opt_filter) = @_;
81
82 my @filtered_tag_pairs = ();
83
84 my @tag_pairs = split(/(<[^>]*>)/,$line);
85 shift @tag_pairs; # given the split RE, there's always an empty-string first match
86
87 my $tag_pairs_len = scalar(@tag_pairs);
88
89 for (my $i=0; $i<$tag_pairs_len; $i+=2) {
90 my $tag = $tag_pairs[$i];
91 my $inner_text = $tag_pairs[$i+1];
92 $inner_text = "" if $inner_text eq "\n";
93
94 if (!defined $opt_filter) {
95 push(@filtered_tag_pairs,$tag,$inner_text);
96 }
97 elsif ($tag =~ m/^<$opt_filter/i) {
98 push(@filtered_tag_pairs,$tag,$inner_text);
99 }
100 }
101
102
103 return \@filtered_tag_pairs;
104}
105
106sub getInnerTextFromTags
107{
108 my ($line,$opt_filter) = @_;
109
110 my @inner_text_vals = ();
111
112 my $filtered_tag_pairs = getTagPairs($line,$opt_filter);
113
114 my $filtered_tag_pairs_len = scalar(@$filtered_tag_pairs);
115
116 for (my $i=0; $i<$filtered_tag_pairs_len; $i+=2) {
117 my $tag = $filtered_tag_pairs->[$i];
118 my $inner_text = $filtered_tag_pairs->[$i+1];
119
120 if ($inner_text ne "") {
121 my $inner_text_unescaped = unescapeEntities($inner_text);
122
123 push(@inner_text_vals,$inner_text_unescaped);
124 }
125 }
126
127 my $inner_text_vals_len = scalar(@inner_text_vals);
128
129 if ($DEBUG) {
130 if ($inner_text_vals_len > 0) {
131 print "Filtered tags, innner text: [\n";
132 map {print " $_\n"} @inner_text_vals;
133 print "]\n";
134 }
135 }
136
137 return \@inner_text_vals;
138}
139
140sub processInnerTextVals
141{
142 my ($inner_text_vals,$line_num) = @_;
143
144 for my $line (@{$inner_text_vals}) {
145
146 my $is_utf8 = check_is_utf8($line);
147 if ($is_utf8) {
148 # if ($line =~ m/[^[:ascii:]]/) {
149
150 # from_to($line, "iso-8859-1", "UTF-8"); #1 -- does not turn on unicode-aware flag
151 # my $line_utf8 = decode("iso-8859-1", $line); #2 -- does turn on unicode-aware flag
152
153 my $line_utf8 = decode("utf8", $line); #2 -- does turn on unicode-aware flag
154
155 if ($line =~ m/[^\x{00}-\x{7F}]/) {
156 print "$line_num:\tUses Non-ASCII chars:\t$line_utf8\n";
157 print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
158 print " Unicode codepoints > 127:\t", debug_unicode_string($line_utf8), "\n";
159 print "-" x 20, "\n";
160 }
161 }
162 else {
163 # my $line_utf8 = decode("utf8", $line); # decode() turns on unicode-aware flag
164
165 print "$line_num:\tNOT VALID UTF8\n";
166 print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
167 print " As Latin1 mapping:\t$line", "\n";
168 print "-" x 20, "\n";
169 }
170 }
171
172}
173
174sub processLine
175{
176 my ($line_num,$line) = @_;
177
178 my $inner_text_vals = getInnerTextFromTags($line,"subfield|marc:subfield");
179
180 processInnerTextVals($inner_text_vals,$line_num);
181
182# # strip out tags
183# $line =~ s/<[^>]*>/ /g;
184
185# # unescape reserved entities
186# $line = unescapeEntities($line);
187
188}
189
190sub processFile
191{
192 my ($filename) = @_;
193
194
195 my $line_num = 0;
196
197 #if (open FILE, '<:encoding(utf8)', $filename) {
198 # Explicitly forcing to Latin1 causes the text file to be read in a 1 byte per char
199 if (open FILE, "< :encoding(Latin1)", $filename) {
200 #if (open FILE, $filename) {
201 while(<FILE>) {
202 my $line .= $_;
203 $line_num++;
204
205 processLine($line_num,$line)
206 }
207 close FILE;
208 }
209 else {
210 print STDERR "Failed to open '$filename'\n";
211 print $!;
212 }
213}
214
215my $filename = $ARGV[0];
216
217
218processFile($filename);
219
220
221#perl -ne 'print if /[^[:ascii:]]/' $*
222
223#<subfield code="a">Ren
224
225# <subfield code="a">Ren
Note: See TracBrowser for help on using the repository browser.