Context Navigation

source: main/tags/2.81/gsdl/perllib/unicode.pm@ 25190

Last change on this file since 25190 was 18282, checked in by ak19, 15 years ago
Spaces in filenames are replaced with underscores just to be on the safe side. Tested that files with spaces in their names still work when using the Remote GS server and also work in the local case (such as mp3 and wmv files).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.5 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38
39	no strict 'refs';
40
41	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
42	# and returns a unicode array.
43	sub ascii2unicode {
44	my ($in) = @_;
45	my $out = [];
46
47	my $i = 0;
48	my $len = length($in);
49	while ($i < $len) {
50	push (@$out, ord(substr ($in, $i, 1)));
51	$i++;
52	}
53
54	return $out;
55	}
56
57	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
58	# UTF-8 encoded string. This is just a faster version of
59	# "&unicode2utf8(&ascii2unicode($str));"
60	# "Extended ascii" really means "iso_8859_1"
61	sub ascii2utf8 {
62	my ($in) = @_;
63	my $out = "";
64
65	if (!defined($in)\|\| !defined($$in)) {
66	return $out;
67	}
68
69	my ($c);
70	my $i = 0;
71	my $len = length($$in);
72	while ($i < $len) {
73	$c = ord (substr ($$in, $i, 1));
74	if ($c < 0x80) {
75	# ascii character
76	$out .= chr ($c);
77
78	} else {
79	# extended ascii character
80	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
81	$out .= chr (0x80 + ($c & 0x3f));
82	}
83	$i++;
84	}
85
86	return $out;
87	}
88
89	# unicode2utf8 takes a unicode array as input and encodes it
90	# using utf-8
91	sub unicode2utf8 {
92	my ($in) = @_;
93	my $out = "";
94
95	foreach my $num (@$in) {
96	next unless defined $num;
97	if ($num < 0x80) {
98	$out .= chr ($num);
99
100	} elsif ($num < 0x800) {
101	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
102	$out .= chr (0x80 + ($num & 0x3f));
103
104	} elsif ($num < 0xFFFF) {
105	$out .= chr (0xe0 + (($num >> 12) & 0xf));
106	$out .= chr (0x80 + (($num >> 6) & 0x3f));
107	$out .= chr (0x80 + ($num & 0x3f));
108
109	} else {
110	# error, don't encode anything
111	die;
112	}
113	}
114	return $out;
115	}
116
117	# utf82unicode takes a utf-8 string and produces a unicode
118	# array
119	sub utf82unicode {
120	my ($in) = @_;
121	my $out = [];
122
123	my $i = 0;
124	my ($c1, $c2, $c3);
125	my $len = length($in);
126	while ($i < $len) {
127	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
128	# normal ascii character
129	push (@$out, $c1);
130
131	} elsif ($c1 < 0xc0) {
132	# error, was expecting the first byte of an
133	# encoded character. Do nothing.
134
135	} elsif ($c1 < 0xe0 && $i+1 < $len) {
136	# an encoded character with two bytes
137	$c2 = ord (substr ($in, $i+1, 1));
138	if ($c2 >= 0x80 && $c2 < 0xc0) {
139	# everything looks ok
140	push (@$out, ((($c1 & 0x1f) << 6) +
141	($c2 & 0x3f)));
142	$i++; # gobbled an extra byte
143	}
144
145	} elsif ($c1 < 0xf0 && $i+2 < $len) {
146	# an encoded character with three bytes
147	$c2 = ord (substr ($in, $i+1, 1));
148	$c3 = ord (substr ($in, $i+2, 1));
149	if ($c2 >= 0x80 && $c2 < 0xc0 &&
150	$c3 >= 0x80 && $c3 < 0xc0) {
151	# everything looks ok
152	push (@$out, ((($c1 & 0xf) << 12) +
153	(($c2 & 0x3f) << 6) +
154	($c3 & 0x3f)));
155
156	$i += 2; # gobbled an extra two bytes
157	}
158
159	} else {
160	# error, only decode Unicode characters not full UCS.
161	# Do nothing.
162	}
163
164	$i++;
165	}
166
167	return $out;
168	}
169
170	# unicode2ucs2 takes a unicode array and produces a UCS-2
171	# unicode string (every two bytes forms a unicode character)
172	sub unicode2ucs2 {
173	my ($in) = @_;
174	my $out = "";
175
176	foreach my $num (@$in) {
177	$out .= chr (($num & 0xff00) >> 8);
178	$out .= chr ($num & 0xff);
179	}
180
181	return $out;
182	}
183
184	# ucs22unicode takes a UCS-2 string and produces a unicode array
185	sub ucs22unicode {
186	my ($in) = @_;
187	my $out = [];
188
189	my $i = 0;
190	my $len = length ($in);
191	while ($i+1 < $len) {
192	push (@$out, ord (substr($in, $i, 1)) << 8 +
193	ord (substr($in, $i+1, 1)));
194
195	$i ++;
196	}
197
198	return $out;
199	}
200
201	# takes a reference to a string and returns a reference to a unicode array
202	sub convert2unicode {
203	my ($encoding, $textref) = @_;
204
205	if (!defined $encodings::encodings->{$encoding}) {
206	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
207	return [];
208	}
209
210	my $encodename = "$encoding-unicode";
211	my $enc_info = $encodings::encodings->{$encoding};
212	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
213	"to_uc", $enc_info->{'mapfile'});
214	if (!&loadmapencoding ($encodename, $mapfile)) {
215	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
216	return [];
217	}
218
219	if (defined $enc_info->{'converter'}) {
220	my $converter = $enc_info->{'converter'};
221	return &$converter ($encodename, $textref);
222	}
223
224	if ($unicode::translations{$encodename}->{'count'} == 1) {
225	return &singlebyte2unicode ($encodename, $textref);
226	} else {
227	return &doublebyte2unicode ($encodename, $textref);
228	}
229	}
230
231	# singlebyte2unicode converts simple 8 bit encodings where characters below
232	# 0x80 are normal ascii characters and the rest are decoded using the
233	# appropriate mapping files.
234	#
235	# Examples of encodings that may be converted using singlebyte2unicode are
236	# the iso-8859 and windows-125* series.
237	sub singlebyte2unicode {
238	my ($encodename, $textref) = @_;
239
240	my @outtext = ();
241	my $len = length($$textref);
242	my ($c);
243	my $i = 0;
244
245	while ($i < $len) {
246	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
247	# normal ascii character
248	push (@outtext, $c);
249	} else {
250	$c = &transchar ($encodename, $c);
251	# put a black square if cannot translate
252	$c = 0x25A1 if $c == 0;
253	push (@outtext, $c);
254	}
255	$i ++;
256	}
257	return \@outtext;
258	}
259
260	# doublebyte2unicode converts simple two byte encodings where characters
261	# below code point 0x80 are single-byte characters and the rest are
262	# double-byte characters.
263	#
264	# Examples of encodings that may be converted using doublebyte2unicode are
265	# CJK encodings like GB encoded Chinese and UHC Korean.
266	#
267	# Note that no error checking is performed to make sure that the input text
268	# is valid for the given encoding.
269	#
270	# Also, encodings that may contain characters of more than two bytes are
271	# not supported (any EUC encoded text may in theory contain 3-byte
272	# characters but in practice only one and two byte characters are used).
273	sub doublebyte2unicode {
274	my ($encodename, $textref) = @_;
275
276	my @outtext = ();
277	my $len = length($$textref);
278	my ($c1, $c2);
279	my $i = 0;
280
281	while ($i < $len) {
282	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
283	if ($i+1 < $len) {
284	# double-byte character
285	$c2 = ord(substr($$textref, $i+1, 1));
286	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
287	# put a black square if cannot translate
288	$c = 0x25A1 if $c == 0;
289	push (@outtext, $c);
290	$i += 2;
291
292	} else {
293	# error
294	print STDERR "unicode: ERROR missing second half of double-byte character\n";
295	$i++;
296	}
297
298	} else {
299	# single-byte character
300	push (@outtext, $c1);
301	$i++;
302	}
303	}
304	return \@outtext;
305	}
306
307	# Shift-JIS to unicode
308	# We can't use doublebyte2unicode for Shift-JIS because it uses some
309	# single-byte characters above code point 0x80 (i.e. half-width katakana
310	# characters in the range 0xA1-0xDF)
311	sub shiftjis2unicode {
312	my ($encodename, $textref) = @_;
313
314	my @outtext = ();
315	my $len = length($$textref);
316	my ($c1, $c2);
317	my $i = 0;
318
319	while ($i < $len) {
320	$c1 = ord(substr($$textref, $i, 1));
321
322	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
323	# Single-byte half-width katakana character or
324	# JIS Roman yen or overline characters
325	my $c = &transchar ($encodename, $c1);
326	# - put a black square if cannot translate
327	$c = 0x25A1 if $c == 0;
328	push (@outtext, $c);
329	$i++;
330
331	} elsif ($c1 < 0x80) {
332	# ASCII
333	push (@outtext, $c1);
334	$i ++;
335
336	} elsif ($c1 < 0xEF) {
337	if ($i+1 < $len) {
338	$c2 = ord(substr($$textref, $i+1, 1));
339	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
340	# Double-byte shift-jis character
341	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
342	# put a black square if cannot translate
343	$c = 0x25A1 if $c == 0;
344	push (@outtext, $c);
345	} else {
346	# error
347	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
348	}
349	$i += 2;
350	} else {
351	# error
352	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
353	$i ++;
354	}
355	} else {
356	# error
357	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
358	$i ++;
359	}
360	}
361	return \@outtext;
362	}
363
364	sub transchar {
365	my ($encoding, $from) = @_;
366	my $high = ($from / 256) % 256;
367	my $low = $from % 256;
368
369	return 0 unless defined $unicode::translations{$encoding};
370
371	my $block = $unicode::translations{$encoding}->{'map'};
372
373	if (ref ($block->[$high]) ne "ARRAY") {
374	return 0;
375	}
376	return $block->[$high]->[$low];
377	}
378
379	# %translations is of the form:
380	#
381	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
382	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
383	#
384	# Any of the top translation blocks can point to an undefined
385	# value. This data structure aims to allow fast translation and
386	# efficient storage.
387	%unicode::translations = ();
388
389	# @array256 is used for initialisation, there must be
390	# a better way...
391	# What about this?: @array256 = (0) x 256;
392	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
393	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
404	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
405	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
406	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
407	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
408
409	# returns 1 if successful, 0 if unsuccessful
410	sub loadmapencoding {
411	my ($encoding, $mapfile) = @_;
412
413	# check to see if the encoding has already been loaded
414	return 1 if (defined $unicode::translations{$encoding});
415
416	if (! -r $mapfile \|\| -d $mapfile) {
417	return 0;
418	}
419	return 0 unless open (MAPFILE, $mapfile);
420	binmode (MAPFILE);
421
422	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
423	my $block = $unicode::translations{$encoding};
424
425	my ($in,$i,$j);
426	while (1) {
427	my $ret=read(MAPFILE, $in, 1);
428	if (!defined($ret)) { # error
429	print STDERR "unicode.pm: error reading mapfile: $!\n";
430	last;
431	}
432	if ($ret != 1) { last }
433	$i = unpack ("C", $in);
434	$block->{'map'}->[$i] = [@unicode::array256];
435	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
436	my ($n1, $n2) = unpack ("CC", $in);
437	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
438	}
439	$block->{'count'} ++;
440	}
441
442	close (MAPFILE);
443	}
444
445	# unicode2singlebyte converts unicode to simple 8 bit encodings where
446	# characters below 0x80 are normal ascii characters and the rest are encoded
447	# using the appropriate mapping files.
448	#
449	# Examples of encodings that may be converted using unicode2singlebyte are
450	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
451	sub unicode2singlebyte {
452	my ($uniref, $encoding) = @_;
453
454	my $outtext = "";
455	my $encodename = "unicode-$encoding";
456
457	if (!exists $encodings::encodings->{$encoding}) {
458	print STDERR "unicode.pm: ERROR - unsupported encoding "
459	. "'$encoding' requested\n";
460	return "";
461	}
462
463	my $enc_info = $encodings::encodings->{$encoding};
464	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
465	"from_uc", $enc_info->{'mapfile'});
466	if (!&loadmapencoding ($encodename, $mapfile)) {
467	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
468	return "";
469	}
470
471	foreach my $c (@$uniref) {
472	if ($c < 0x80) {
473	# normal ascii character
474	$outtext .= chr($c);
475	} else {
476	# extended ascii character
477	$c = &transchar ($encodename, $c);
478
479	# put a question mark if cannot translate
480	if ($c == 0) {
481	$outtext .= "?";
482	} else {
483	$outtext .= chr($c);
484	}
485	}
486	}
487	return $outtext;
488	}
489
490
491	# this makes sure that the referenced input string is utf8 encoded, and
492	# will change/remove bytes that aren't.
493	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
494	sub ensure_utf8 {
495	my $stringref=shift;
496
497	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
498	return $stringref;
499	}
500
501	my $value=$$stringref;
502
503	my $non_utf8_found = 0;
504	$value =~ m/^/g; # to set \G
505	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
506	my $highbytes=$1;
507	my $highbyteslength=length($highbytes);
508	# make sure this block of high bytes is utf-8
509	$highbytes =~ /^/g; # set pos()
510	my $byte_replaced = 0;
511	while ($highbytes =~
512	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
513	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
514	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
515	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
516	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
517	)*([\x80-\xff])? !xg
518	) {
519	# this highbyte is "out-of-place" for valid utf-8
520	my $badbyte=$1;
521	if (!defined $badbyte) {next} # hit end of string
522	my $pos=pos($highbytes);
523	# replace bad byte. assume iso-8859-1 -> utf-8
524	# ascii2utf8 does "extended ascii"... ie iso-8859-1
525	my $replacement=&unicode::ascii2utf8(\$badbyte);
526	substr($highbytes, $pos-1, 1, $replacement);
527	# update the position to continue searching (for \G)
528	pos($highbytes) = $pos+length($replacement)-1;
529	$byte_replaced = 1;
530	}
531	if ($byte_replaced) {
532	# replace this block of high bytes in the $value
533	$non_utf8_found = 1;
534	my $replength=length($highbytes); # we've changed the length
535	my $textpos=pos($value); # pos at end of last match
536	# replace bad bytes with good bytes
537	substr($value, $textpos-$highbyteslength,
538	$highbyteslength, $highbytes);
539	# update the position to continue searching (for \G)
540	pos($value)=$textpos+($replength-$highbyteslength)+1;
541	}
542	}
543
544	$$stringref = $value;
545	return $non_utf8_found;
546	}
547
548	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
549	# Does not modify the string parameter.
550	sub check_is_utf8 {
551	my $value=shift;
552
553	if (!defined($value)) {
554	return 0; # not utf8 because it is undefined
555	}
556
557	$value =~ m/^/g; # to set \G
558	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
559	my $highbytes=$1;
560	# make sure this block of high bytes is utf-8
561	$highbytes =~ /^/g; # set pos()
562	while ($highbytes =~
563	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
564	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
565	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
566	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
567	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
568	)*([\x80-\xff])? !xg
569	) {
570	my $badbyte=$1;
571	if (defined $badbyte) { # not end of string
572	return 0; # non-utf8 found
573	}
574	}
575	}
576
577	return 1;
578	}
579
580	sub url_encode {
581	my ($text) = @_;
582	if (!&is_url_encoded($text)) {
583	$text =~ s/ /_/g; # spaces replaced with underscore
584	$text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
585	}
586	return $text;
587	}
588
589	sub url_decode {
590	my ($text) = @_;
591
592	$text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
593	return $text;
594	}
595
596	sub is_url_encoded {
597	my ($text) = @_;
598	return ($text =~ m/\%([A-F0-9]{2})/);
599	}
600
601	# When a filename on the filesystem is already URL-encoded, the
602	# URL to it will have %25s in in place of every % sign, so that
603	# URLs in html pages can refer to the URL-encoded filename.
604	# This method changes the URL reference back into the actual
605	# (URL-encoded) filename on the filesystem by replacing %25 with %.
606	sub url_to_filename {
607	my ($text) =@_;
608	$text =~ s/%25/%/g;
609	return $text;
610	}
611
612	# When a filename on the filesystem is already URL-encoded, the
613	# URL to it will have %25s in in place of every % sign, so that
614	# URLs in html pages can refer to the URL-encoded filename.
615	# Given a (URL-encoded) filename on the filesystem, this subroutine
616	# returns the URL reference string for it by replacing % with %25.
617	# The output string will be the same as the input string if the input
618	# already contains one or more %25s. This is to prevent processing
619	# a url more than once this way.
620	sub filename_to_url {
621	my ($text) =@_;
622
623	if($text !~ m/%25/) {
624	$text =~ s/%/%25/g;
625	}
626	return $text;
627	}
628
629
630	sub substr
631	{
632	my ($utf8_string, $offset, $length) = @_;
633
634	my @unicode_string = @{&utf82unicode($utf8_string)};
635	my $unicode_string_length = scalar(@unicode_string);
636
637	my $substr_start = $offset;
638	if ($substr_start >= $unicode_string_length) {
639	return "";
640	}
641
642	my $substr_end = $offset + $length - 1;
643	if ($substr_end >= $unicode_string_length) {
644	$substr_end = $unicode_string_length - 1;
645	}
646
647	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
648	return &unicode2utf8(\@unicode_substring);
649	}
650
651
652	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: