Context Navigation

source: gsdl/trunk/perllib/unicode.pm@ 18201

Last change on this file since 18201 was 18201, checked in by ak19, 15 years ago
When associated files are renamed with the URLencoded versions of their original filenames, the spaces are no longer URL encoded, as this conflicted with mp3, wmv and possibly other media file formats being opened in external or browser-embedded apps
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.4 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38
39	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
40	# and returns a unicode array.
41	sub ascii2unicode {
42	my ($in) = @_;
43	my $out = [];
44
45	my $i = 0;
46	my $len = length($in);
47	while ($i < $len) {
48	push (@$out, ord(substr ($in, $i, 1)));
49	$i++;
50	}
51
52	return $out;
53	}
54
55	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
56	# UTF-8 encoded string. This is just a faster version of
57	# "&unicode2utf8(&ascii2unicode($str));"
58	# "Extended ascii" really means "iso_8859_1"
59	sub ascii2utf8 {
60	my ($in) = @_;
61	my $out = "";
62
63	if (!defined($in)\|\| !defined($$in)) {
64	return $out;
65	}
66
67	my ($c);
68	my $i = 0;
69	my $len = length($$in);
70	while ($i < $len) {
71	$c = ord (substr ($$in, $i, 1));
72	if ($c < 0x80) {
73	# ascii character
74	$out .= chr ($c);
75
76	} else {
77	# extended ascii character
78	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
79	$out .= chr (0x80 + ($c & 0x3f));
80	}
81	$i++;
82	}
83
84	return $out;
85	}
86
87	# unicode2utf8 takes a unicode array as input and encodes it
88	# using utf-8
89	sub unicode2utf8 {
90	my ($in) = @_;
91	my $out = "";
92
93	foreach my $num (@$in) {
94	next unless defined $num;
95	if ($num < 0x80) {
96	$out .= chr ($num);
97
98	} elsif ($num < 0x800) {
99	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
100	$out .= chr (0x80 + ($num & 0x3f));
101
102	} elsif ($num < 0xFFFF) {
103	$out .= chr (0xe0 + (($num >> 12) & 0xf));
104	$out .= chr (0x80 + (($num >> 6) & 0x3f));
105	$out .= chr (0x80 + ($num & 0x3f));
106
107	} else {
108	# error, don't encode anything
109	die;
110	}
111	}
112	return $out;
113	}
114
115	# utf82unicode takes a utf-8 string and produces a unicode
116	# array
117	sub utf82unicode {
118	my ($in) = @_;
119	my $out = [];
120
121	my $i = 0;
122	my ($c1, $c2, $c3);
123	my $len = length($in);
124	while ($i < $len) {
125	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
126	# normal ascii character
127	push (@$out, $c1);
128
129	} elsif ($c1 < 0xc0) {
130	# error, was expecting the first byte of an
131	# encoded character. Do nothing.
132
133	} elsif ($c1 < 0xe0 && $i+1 < $len) {
134	# an encoded character with two bytes
135	$c2 = ord (substr ($in, $i+1, 1));
136	if ($c2 >= 0x80 && $c2 < 0xc0) {
137	# everything looks ok
138	push (@$out, ((($c1 & 0x1f) << 6) +
139	($c2 & 0x3f)));
140	$i++; # gobbled an extra byte
141	}
142
143	} elsif ($c1 < 0xf0 && $i+2 < $len) {
144	# an encoded character with three bytes
145	$c2 = ord (substr ($in, $i+1, 1));
146	$c3 = ord (substr ($in, $i+2, 1));
147	if ($c2 >= 0x80 && $c2 < 0xc0 &&
148	$c3 >= 0x80 && $c3 < 0xc0) {
149	# everything looks ok
150	push (@$out, ((($c1 & 0xf) << 12) +
151	(($c2 & 0x3f) << 6) +
152	($c3 & 0x3f)));
153
154	$i += 2; # gobbled an extra two bytes
155	}
156
157	} else {
158	# error, only decode Unicode characters not full UCS.
159	# Do nothing.
160	}
161
162	$i++;
163	}
164
165	return $out;
166	}
167
168	# unicode2ucs2 takes a unicode array and produces a UCS-2
169	# unicode string (every two bytes forms a unicode character)
170	sub unicode2ucs2 {
171	my ($in) = @_;
172	my $out = "";
173
174	foreach my $num (@$in) {
175	$out .= chr (($num & 0xff00) >> 8);
176	$out .= chr ($num & 0xff);
177	}
178
179	return $out;
180	}
181
182	# ucs22unicode takes a UCS-2 string and produces a unicode array
183	sub ucs22unicode {
184	my ($in) = @_;
185	my $out = [];
186
187	my $i = 0;
188	my $len = length ($in);
189	while ($i+1 < $len) {
190	push (@$out, ord (substr($in, $i, 1)) << 8 +
191	ord (substr($in, $i+1, 1)));
192
193	$i ++;
194	}
195
196	return $out;
197	}
198
199	# takes a reference to a string and returns a reference to a unicode array
200	sub convert2unicode {
201	my ($encoding, $textref) = @_;
202
203	if (!defined $encodings::encodings->{$encoding}) {
204	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
205	return [];
206	}
207
208	my $encodename = "$encoding-unicode";
209	my $enc_info = $encodings::encodings->{$encoding};
210	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
211	"to_uc", $enc_info->{'mapfile'});
212	if (!&loadmapencoding ($encodename, $mapfile)) {
213	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
214	return [];
215	}
216
217	if (defined $enc_info->{'converter'}) {
218	my $converter = $enc_info->{'converter'};
219	return &$converter ($encodename, $textref);
220	}
221
222	if ($unicode::translations{$encodename}->{'count'} == 1) {
223	return &singlebyte2unicode ($encodename, $textref);
224	} else {
225	return &doublebyte2unicode ($encodename, $textref);
226	}
227	}
228
229	# singlebyte2unicode converts simple 8 bit encodings where characters below
230	# 0x80 are normal ascii characters and the rest are decoded using the
231	# appropriate mapping files.
232	#
233	# Examples of encodings that may be converted using singlebyte2unicode are
234	# the iso-8859 and windows-125* series.
235	sub singlebyte2unicode {
236	my ($encodename, $textref) = @_;
237
238	my @outtext = ();
239	my $len = length($$textref);
240	my ($c);
241	my $i = 0;
242
243	while ($i < $len) {
244	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
245	# normal ascii character
246	push (@outtext, $c);
247	} else {
248	$c = &transchar ($encodename, $c);
249	# put a black square if cannot translate
250	$c = 0x25A1 if $c == 0;
251	push (@outtext, $c);
252	}
253	$i ++;
254	}
255	return \@outtext;
256	}
257
258	# doublebyte2unicode converts simple two byte encodings where characters
259	# below code point 0x80 are single-byte characters and the rest are
260	# double-byte characters.
261	#
262	# Examples of encodings that may be converted using doublebyte2unicode are
263	# CJK encodings like GB encoded Chinese and UHC Korean.
264	#
265	# Note that no error checking is performed to make sure that the input text
266	# is valid for the given encoding.
267	#
268	# Also, encodings that may contain characters of more than two bytes are
269	# not supported (any EUC encoded text may in theory contain 3-byte
270	# characters but in practice only one and two byte characters are used).
271	sub doublebyte2unicode {
272	my ($encodename, $textref) = @_;
273
274	my @outtext = ();
275	my $len = length($$textref);
276	my ($c1, $c2);
277	my $i = 0;
278
279	while ($i < $len) {
280	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
281	if ($i+1 < $len) {
282	# double-byte character
283	$c2 = ord(substr($$textref, $i+1, 1));
284	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
285	# put a black square if cannot translate
286	$c = 0x25A1 if $c == 0;
287	push (@outtext, $c);
288	$i += 2;
289
290	} else {
291	# error
292	print STDERR "unicode: ERROR missing second half of double-byte character\n";
293	$i++;
294	}
295
296	} else {
297	# single-byte character
298	push (@outtext, $c1);
299	$i++;
300	}
301	}
302	return \@outtext;
303	}
304
305	# Shift-JIS to unicode
306	# We can't use doublebyte2unicode for Shift-JIS because it uses some
307	# single-byte characters above code point 0x80 (i.e. half-width katakana
308	# characters in the range 0xA1-0xDF)
309	sub shiftjis2unicode {
310	my ($encodename, $textref) = @_;
311
312	my @outtext = ();
313	my $len = length($$textref);
314	my ($c1, $c2);
315	my $i = 0;
316
317	while ($i < $len) {
318	$c1 = ord(substr($$textref, $i, 1));
319
320	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
321	# Single-byte half-width katakana character or
322	# JIS Roman yen or overline characters
323	my $c = &transchar ($encodename, $c1);
324	# - put a black square if cannot translate
325	$c = 0x25A1 if $c == 0;
326	push (@outtext, $c);
327	$i++;
328
329	} elsif ($c1 < 0x80) {
330	# ASCII
331	push (@outtext, $c1);
332	$i ++;
333
334	} elsif ($c1 < 0xEF) {
335	if ($i+1 < $len) {
336	$c2 = ord(substr($$textref, $i+1, 1));
337	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
338	# Double-byte shift-jis character
339	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
340	# put a black square if cannot translate
341	$c = 0x25A1 if $c == 0;
342	push (@outtext, $c);
343	} else {
344	# error
345	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
346	}
347	$i += 2;
348	} else {
349	# error
350	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
351	$i ++;
352	}
353	} else {
354	# error
355	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
356	$i ++;
357	}
358	}
359	return \@outtext;
360	}
361
362	sub transchar {
363	my ($encoding, $from) = @_;
364	my $high = ($from / 256) % 256;
365	my $low = $from % 256;
366
367	return 0 unless defined $unicode::translations{$encoding};
368
369	my $block = $unicode::translations{$encoding}->{'map'};
370
371	if (ref ($block->[$high]) ne "ARRAY") {
372	return 0;
373	}
374	return $block->[$high]->[$low];
375	}
376
377	# %translations is of the form:
378	#
379	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
380	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
381	#
382	# Any of the top translation blocks can point to an undefined
383	# value. This data structure aims to allow fast translation and
384	# efficient storage.
385	%unicode::translations = ();
386
387	# @array256 is used for initialisation, there must be
388	# a better way...
389	# What about this?: @array256 = (0) x 256;
390	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
391	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
392	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
393	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
404	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
405	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
406
407	# returns 1 if successful, 0 if unsuccessful
408	sub loadmapencoding {
409	my ($encoding, $mapfile) = @_;
410
411	# check to see if the encoding has already been loaded
412	return 1 if (defined $unicode::translations{$encoding});
413
414	if (! -r $mapfile \|\| -d $mapfile) {
415	return 0;
416	}
417	return 0 unless open (MAPFILE, $mapfile);
418	binmode (MAPFILE);
419
420	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
421	my $block = $unicode::translations{$encoding};
422
423	my ($in,$i,$j);
424	while (1) {
425	my $ret=read(MAPFILE, $in, 1);
426	if (!defined($ret)) { # error
427	print STDERR "unicode.pm: error reading mapfile: $!\n";
428	last;
429	}
430	if ($ret != 1) { last }
431	$i = unpack ("C", $in);
432	$block->{'map'}->[$i] = [@unicode::array256];
433	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
434	my ($n1, $n2) = unpack ("CC", $in);
435	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
436	}
437	$block->{'count'} ++;
438	}
439
440	close (MAPFILE);
441	}
442
443	# unicode2singlebyte converts unicode to simple 8 bit encodings where
444	# characters below 0x80 are normal ascii characters and the rest are encoded
445	# using the appropriate mapping files.
446	#
447	# Examples of encodings that may be converted using unicode2singlebyte are
448	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
449	sub unicode2singlebyte {
450	my ($uniref, $encoding) = @_;
451
452	my $outtext = "";
453	my $encodename = "unicode-$encoding";
454
455	if (!exists $encodings::encodings->{$encoding}) {
456	print STDERR "unicode.pm: ERROR - unsupported encoding "
457	. "'$encoding' requested\n";
458	return "";
459	}
460
461	my $enc_info = $encodings::encodings->{$encoding};
462	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
463	"from_uc", $enc_info->{'mapfile'});
464	if (!&loadmapencoding ($encodename, $mapfile)) {
465	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
466	return "";
467	}
468
469	foreach my $c (@$uniref) {
470	if ($c < 0x80) {
471	# normal ascii character
472	$outtext .= chr($c);
473	} else {
474	# extended ascii character
475	$c = &transchar ($encodename, $c);
476
477	# put a question mark if cannot translate
478	if ($c == 0) {
479	$outtext .= "?";
480	} else {
481	$outtext .= chr($c);
482	}
483	}
484	}
485	return $outtext;
486	}
487
488
489	# this makes sure that the referenced input string is utf8 encoded, and
490	# will change/remove bytes that aren't.
491	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
492	sub ensure_utf8 {
493	my $stringref=shift;
494
495	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
496	return $stringref;
497	}
498
499	my $value=$$stringref;
500
501	my $non_utf8_found = 0;
502	$value =~ m/^/g; # to set \G
503	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
504	my $highbytes=$1;
505	my $highbyteslength=length($highbytes);
506	# make sure this block of high bytes is utf-8
507	$highbytes =~ /^/g; # set pos()
508	my $byte_replaced = 0;
509	while ($highbytes =~
510	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
511	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
512	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
513	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
514	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
515	)*([\x80-\xff])? !xg
516	) {
517	# this highbyte is "out-of-place" for valid utf-8
518	my $badbyte=$1;
519	if (!defined $badbyte) {next} # hit end of string
520	my $pos=pos($highbytes);
521	# replace bad byte. assume iso-8859-1 -> utf-8
522	# ascii2utf8 does "extended ascii"... ie iso-8859-1
523	my $replacement=&unicode::ascii2utf8(\$badbyte);
524	substr($highbytes, $pos-1, 1, $replacement);
525	# update the position to continue searching (for \G)
526	pos($highbytes) = $pos+length($replacement)-1;
527	$byte_replaced = 1;
528	}
529	if ($byte_replaced) {
530	# replace this block of high bytes in the $value
531	$non_utf8_found = 1;
532	my $replength=length($highbytes); # we've changed the length
533	my $textpos=pos($value); # pos at end of last match
534	# replace bad bytes with good bytes
535	substr($value, $textpos-$highbyteslength,
536	$highbyteslength, $highbytes);
537	# update the position to continue searching (for \G)
538	pos($value)=$textpos+($replength-$highbyteslength)+1;
539	}
540	}
541
542	$$stringref = $value;
543	return $non_utf8_found;
544	}
545
546	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
547	# Does not modify the string parameter.
548	sub check_is_utf8 {
549	my $value=shift;
550
551	if (!defined($value)) {
552	return 0; # not utf8 because it is undefined
553	}
554
555	$value =~ m/^/g; # to set \G
556	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
557	my $highbytes=$1;
558	# make sure this block of high bytes is utf-8
559	$highbytes =~ /^/g; # set pos()
560	while ($highbytes =~
561	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
562	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
563	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
564	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
565	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
566	)*([\x80-\xff])? !xg
567	) {
568	my $badbyte=$1;
569	if (defined $badbyte) { # not end of string
570	return 0; # non-utf8 found
571	}
572	}
573	}
574
575	return 1;
576	}
577
578	sub url_encode {
579	my ($text) = @_;
580	if (!&is_url_encoded($text)) {
581	$text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
582	}
583	return $text;
584	}
585
586	sub url_decode {
587	my ($text) = @_;
588
589	$text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
590	return $text;
591	}
592
593	sub is_url_encoded {
594	my ($text) = @_;
595	return ($text =~ m/\%([A-F0-9]{2})/);
596	}
597
598	# When a filename on the filesystem is already URL-encoded, the
599	# URL to it will have %25s in in place of every % sign, so that
600	# URLs in html pages can refer to the URL-encoded filename.
601	# This method changes the URL reference back into the actual
602	# (URL-encoded) filename on the filesystem by replacing %25 with %.
603	sub url_to_filename {
604	my ($text) =@_;
605	$text =~ s/%25/%/g;
606	return $text;
607	}
608
609	# When a filename on the filesystem is already URL-encoded, the
610	# URL to it will have %25s in in place of every % sign, so that
611	# URLs in html pages can refer to the URL-encoded filename.
612	# Given a (URL-encoded) filename on the filesystem, this subroutine
613	# returns the URL reference string for it by replacing % with %25.
614	# The output string will be the same as the input string if the input
615	# already contains one or more %25s. This is to prevent processing
616	# a url more than once this way.
617	sub filename_to_url {
618	my ($text) =@_;
619
620	if($text !~ m/%25/) {
621	$text =~ s/%/%25/g;
622	}
623	return $text;
624	}
625
626
627	sub substr
628	{
629	my ($utf8_string, $offset, $length) = @_;
630
631	my @unicode_string = @{&utf82unicode($utf8_string)};
632	my $unicode_string_length = scalar(@unicode_string);
633
634	my $substr_start = $offset;
635	if ($substr_start >= $unicode_string_length) {
636	return "";
637	}
638
639	my $substr_end = $offset + $length - 1;
640	if ($substr_end >= $unicode_string_length) {
641	$substr_end = $unicode_string_length - 1;
642	}
643
644	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
645	return &unicode2utf8(\@unicode_substring);
646	}
647
648
649	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: