Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

HUNT-FOR-NON-ASCII.pl@ 36291

Last change on this file since 36291 was 36291, checked in by davidb, 22 months ago
Some basic tag processing of MARCXML records, printing out lines with non-ASCII chars
File size: 5.2 KB

Line
1	use strict;
2
3	use Encode qw(decode encode);
4
5	binmode STDERR, ":encoding(utf8)";
6	binmode STDOUT, ":encoding(utf8)";
7
8	my $DEBUG = 0;
9
10	sub debug_unicode_string
11	{
12	join("",
13	map { $_ > 127 ? # if wide character...
14	sprintf("\\x{%04X}", $_) : # \x{...}
15	chr($_)
16	} unpack("U*", $_[0])); # unpack Unicode characters
17	}
18
19
20	sub debug_unicode_string_bytes
21	{
22	join("",
23	map { $_ > 127 ? # if wide character...
24	sprintf("\\u{%02X}", $_) : # \x{...}
25	chr($_)
26	} unpack("U*", $_[0])); # unpack Unicode characters
27	}
28
29
30	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
31	# Does not modify the string parameter.
32	sub check_is_utf8 {
33	my $value=shift;
34
35	if (!defined($value)) {
36	return 0; # not utf8 because it is undefined
37	}
38
39	$value =~ m/^/g; # to set \G
40	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
41	my $highbytes=$1;
42	# make sure this block of high bytes is utf-8
43	$highbytes =~ /^/g; # set pos()
44	while ($highbytes =~
45	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
46	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
47	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
48	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
49	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
50	)*([\x80-\xff])? !xg
51	) {
52	my $badbyte=$1;
53	if (defined $badbyte) { # not end of string
54	return 0; # non-utf8 found
55	}
56	}
57	}
58
59	return 1;
60	}
61
62
63	sub unescapeEntities
64	{
65	my ($line) = @_;
66
67	# unescape reserved entities
68	$line =~ s/'/'/g;
69	$line =~ s/"/"/g;
70	$line =~ s/</</g;
71	$line =~ s/>/>/g;
72	$line =~ s/&/&/g;
73
74	return $line;
75	}
76
77
78	sub getTagPairs
79	{
80	my ($line,$opt_filter) = @_;
81
82	my @filtered_tag_pairs = ();
83
84	my @tag_pairs = split(/(<[^>]*>)/,$line);
85	shift @tag_pairs; # given the split RE, there's always an empty-string first match
86
87	my $tag_pairs_len = scalar(@tag_pairs);
88
89	for (my $i=0; $i<$tag_pairs_len; $i+=2) {
90	my $tag = $tag_pairs[$i];
91	my $inner_text = $tag_pairs[$i+1];
92	$inner_text = "" if $inner_text eq "\n";
93
94	if (!defined $opt_filter) {
95	push(@filtered_tag_pairs,$tag,$inner_text);
96	}
97	elsif ($tag =~ m/^<$opt_filter/i) {
98	push(@filtered_tag_pairs,$tag,$inner_text);
99	}
100	}
101
102
103	return \@filtered_tag_pairs;
104	}
105
106	sub getInnerTextFromTags
107	{
108	my ($line,$opt_filter) = @_;
109
110	my @inner_text_vals = ();
111
112	my $filtered_tag_pairs = getTagPairs($line,$opt_filter);
113
114	my $filtered_tag_pairs_len = scalar(@$filtered_tag_pairs);
115
116	for (my $i=0; $i<$filtered_tag_pairs_len; $i+=2) {
117	my $tag = $filtered_tag_pairs->[$i];
118	my $inner_text = $filtered_tag_pairs->[$i+1];
119
120	if ($inner_text ne "") {
121	my $inner_text_unescaped = unescapeEntities($inner_text);
122
123	push(@inner_text_vals,$inner_text_unescaped);
124	}
125	}
126
127	my $inner_text_vals_len = scalar(@inner_text_vals);
128
129	if ($DEBUG) {
130	if ($inner_text_vals_len > 0) {
131	print "Filtered tags, innner text: [\n";
132	map {print " $_\n"} @inner_text_vals;
133	print "]\n";
134	}
135	}
136
137	return \@inner_text_vals;
138	}
139
140	sub processInnerTextVals
141	{
142	my ($inner_text_vals,$line_num) = @_;
143
144	for my $line (@{$inner_text_vals}) {
145
146	my $is_utf8 = check_is_utf8($line);
147	if ($is_utf8) {
148	# if ($line =~ m/[^[:ascii:]]/) {
149
150	# from_to($line, "iso-8859-1", "UTF-8"); #1 -- does not turn on unicode-aware flag
151	# my $line_utf8 = decode("iso-8859-1", $line); #2 -- does turn on unicode-aware flag
152
153	my $line_utf8 = decode("utf8", $line); #2 -- does turn on unicode-aware flag
154
155	if ($line =~ m/[^\x{00}-\x{7F}]/) {
156	print "$line_num:\tUses Non-ASCII chars:\t$line_utf8\n";
157	print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
158	print " Unicode codepoints > 127:\t", debug_unicode_string($line_utf8), "\n";
159	print "-" x 20, "\n";
160	}
161	}
162	else {
163	# my $line_utf8 = decode("utf8", $line); # decode() turns on unicode-aware flag
164
165	print "$line_num:\tNOT VALID UTF8\n";
166	print " UTF8 Bytes > 127:\t", debug_unicode_string_bytes($line), "\n";
167	print " As Latin1 mapping:\t$line", "\n";
168	print "-" x 20, "\n";
169	}
170	}
171
172	}
173
174	sub processLine
175	{
176	my ($line_num,$line) = @_;
177
178	my $inner_text_vals = getInnerTextFromTags($line,"subfield\|marc:subfield");
179
180	processInnerTextVals($inner_text_vals,$line_num);
181
182	# # strip out tags
183	# $line =~ s/<[^>]*>/ /g;
184
185	# # unescape reserved entities
186	# $line = unescapeEntities($line);
187
188	}
189
190	sub processFile
191	{
192	my ($filename) = @_;
193
194
195	my $line_num = 0;
196
197	#if (open FILE, '<:encoding(utf8)', $filename) {
198	# Explicitly forcing to Latin1 causes the text file to be read in a 1 byte per char
199	if (open FILE, "< :encoding(Latin1)", $filename) {
200	#if (open FILE, $filename) {
201	while(<FILE>) {
202	my $line .= $_;
203	$line_num++;
204
205	processLine($line_num,$line)
206	}
207	close FILE;
208	}
209	else {
210	print STDERR "Failed to open '$filename'\n";
211	print $!;
212	}
213	}
214
215	my $filename = $ARGV[0];
216
217
218	processFile($filename);
219
220
221	#perl -ne 'print if /[^[:ascii:]]/' $*
222
223	#<subfield code="a">Ren
224
225	# <subfield code="a">Ren

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/model-sites-dev/von-sparql/collect/nz-natlib-cat/pre-import/HUNT-FOR-NON-ASCII.pl@ 36291

Download in other formats: