Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 23362

Last change on this file since 23362 was 23362, checked in by davidb, 13 years ago
Additional routines (and few upgraded) to help support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 21.5 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use MIME::Base64; # for base64 encoding
39
40	no strict 'refs';
41
42
43
44	sub utf8decomp
45	{
46	my ($str) = @_;
47
48	return if (!defined $str);
49	return "" if ($str eq "");
50
51	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53	my @each_char
54	= map { ($_ > 255 )
55	? # if wide character...
56	sprintf("\\x{%04X}", $_)
57	: # \x{...}
58	(chr($_) =~ m/[[:cntrl:]]/ )
59	? # else if control character ...
60	sprintf("\\x%02X", $_)
61	: # \x..
62	quotemeta(chr($_)) # else quoted or as themselves
63	} @unpacked_chars;
64
65	return join("",@each_char);
66	}
67
68
69	sub hex_codepoint {
70	if (my $char = shift) {
71	return sprintf '%2.2x', unpack('U0U*', $char);
72	}
73	}
74
75
76
77
78	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79	# and returns a unicode array.
80	sub ascii2unicode {
81	my ($in) = @_;
82	my $out = [];
83
84	my $i = 0;
85	my $len = length($in);
86	while ($i < $len) {
87	push (@$out, ord(substr ($in, $i, 1)));
88	$i++;
89	}
90
91	return $out;
92	}
93
94	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95	# UTF-8 encoded string. This is just a faster version of
96	# "&unicode2utf8(&ascii2unicode($str));"
97	# "Extended ascii" really means "iso_8859_1"
98	sub ascii2utf8 {
99	my ($in) = @_;
100	my $out = "";
101
102	if (!defined($in)\|\| !defined($$in)) {
103	return $out;
104	}
105
106	my ($c);
107	my $i = 0;
108	my $len = length($$in);
109	while ($i < $len) {
110	$c = ord (substr ($$in, $i, 1));
111	if ($c < 0x80) {
112	# ascii character
113	$out .= chr ($c);
114
115	} else {
116	# extended ascii character
117	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
118	$out .= chr (0x80 + ($c & 0x3f));
119	}
120	$i++;
121	}
122
123	return $out;
124	}
125
126	# unicode2utf8 takes a unicode array as input and encodes it
127	# using utf-8
128	sub unicode2utf8 {
129	my ($in) = @_;
130	my $out = "";
131
132	foreach my $num (@$in) {
133	next unless defined $num;
134	if ($num < 0x80) {
135	$out .= chr ($num);
136
137	} elsif ($num < 0x800) {
138	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
139	$out .= chr (0x80 + ($num & 0x3f));
140
141	} elsif ($num < 0xFFFF) {
142	$out .= chr (0xe0 + (($num >> 12) & 0xf));
143	$out .= chr (0x80 + (($num >> 6) & 0x3f));
144	$out .= chr (0x80 + ($num & 0x3f));
145
146	} else {
147	# error, don't encode anything
148	die;
149	}
150	}
151	return $out;
152	}
153
154	# utf82unicode takes a utf-8 string and produces a unicode
155	# array
156	sub utf82unicode {
157	my ($in) = @_;
158	my $out = [];
159
160	my $i = 0;
161	my ($c1, $c2, $c3);
162	my $len = length($in);
163	while ($i < $len) {
164	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165	# normal ascii character
166	push (@$out, $c1);
167
168	} elsif ($c1 < 0xc0) {
169	# error, was expecting the first byte of an
170	# encoded character. Do nothing.
171
172	} elsif ($c1 < 0xe0 && $i+1 < $len) {
173	# an encoded character with two bytes
174	$c2 = ord (substr ($in, $i+1, 1));
175	if ($c2 >= 0x80 && $c2 < 0xc0) {
176	# everything looks ok
177	push (@$out, ((($c1 & 0x1f) << 6) +
178	($c2 & 0x3f)));
179	$i++; # gobbled an extra byte
180	}
181
182	} elsif ($c1 < 0xf0 && $i+2 < $len) {
183	# an encoded character with three bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	$c3 = ord (substr ($in, $i+2, 1));
186	if ($c2 >= 0x80 && $c2 < 0xc0 &&
187	$c3 >= 0x80 && $c3 < 0xc0) {
188	# everything looks ok
189	push (@$out, ((($c1 & 0xf) << 12) +
190	(($c2 & 0x3f) << 6) +
191	($c3 & 0x3f)));
192
193	$i += 2; # gobbled an extra two bytes
194	}
195
196	} else {
197	# error, only decode Unicode characters not full UCS.
198	# Do nothing.
199	}
200
201	$i++;
202	}
203
204	return $out;
205	}
206
207	# unicode2ucs2 takes a unicode array and produces a UCS-2
208	# unicode string (every two bytes forms a unicode character)
209	sub unicode2ucs2 {
210	my ($in) = @_;
211	my $out = "";
212
213	foreach my $num (@$in) {
214	$out .= chr (($num & 0xff00) >> 8);
215	$out .= chr ($num & 0xff);
216	}
217
218	return $out;
219	}
220
221	# ucs22unicode takes a UCS-2 string and produces a unicode array
222	sub ucs22unicode {
223	my ($in) = @_;
224	my $out = [];
225
226	my $i = 0;
227	my $len = length ($in);
228	while ($i+1 < $len) {
229	push (@$out, ord (substr($in, $i, 1)) << 8 +
230	ord (substr($in, $i+1, 1)));
231
232	$i ++;
233	}
234
235	return $out;
236	}
237
238	# takes a reference to a string and returns a reference to a unicode array
239	sub convert2unicode {
240	my ($encoding, $textref) = @_;
241
242	if (!defined $encodings::encodings->{$encoding}) {
243	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244	return [];
245	}
246
247	my $encodename = "$encoding-unicode";
248	my $enc_info = $encodings::encodings->{$encoding};
249	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250	"to_uc", $enc_info->{'mapfile'});
251	if (!&loadmapencoding ($encodename, $mapfile)) {
252	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253	return [];
254	}
255
256	if (defined $enc_info->{'converter'}) {
257	my $converter = $enc_info->{'converter'};
258	return &$converter ($encodename, $textref);
259	}
260
261	if ($unicode::translations{$encodename}->{'count'} == 1) {
262	return &singlebyte2unicode ($encodename, $textref);
263	} else {
264	return &doublebyte2unicode ($encodename, $textref);
265	}
266	}
267
268	# singlebyte2unicode converts simple 8 bit encodings where characters below
269	# 0x80 are normal ascii characters and the rest are decoded using the
270	# appropriate mapping files.
271	#
272	# Examples of encodings that may be converted using singlebyte2unicode are
273	# the iso-8859 and windows-125* series.
274	sub singlebyte2unicode {
275	my ($encodename, $textref) = @_;
276
277	my @outtext = ();
278	my $len = length($$textref);
279	my ($c);
280	my $i = 0;
281
282	while ($i < $len) {
283	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284	# normal ascii character
285	push (@outtext, $c);
286	} else {
287	$c = &transchar ($encodename, $c);
288	# put a black square if cannot translate
289	$c = 0x25A1 if $c == 0;
290	push (@outtext, $c);
291	}
292	$i ++;
293	}
294	return \@outtext;
295	}
296
297	# doublebyte2unicode converts simple two byte encodings where characters
298	# below code point 0x80 are single-byte characters and the rest are
299	# double-byte characters.
300	#
301	# Examples of encodings that may be converted using doublebyte2unicode are
302	# CJK encodings like GB encoded Chinese and UHC Korean.
303	#
304	# Note that no error checking is performed to make sure that the input text
305	# is valid for the given encoding.
306	#
307	# Also, encodings that may contain characters of more than two bytes are
308	# not supported (any EUC encoded text may in theory contain 3-byte
309	# characters but in practice only one and two byte characters are used).
310	sub doublebyte2unicode {
311	my ($encodename, $textref) = @_;
312
313	my @outtext = ();
314	my $len = length($$textref);
315	my ($c1, $c2);
316	my $i = 0;
317
318	while ($i < $len) {
319	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320	if ($i+1 < $len) {
321	# double-byte character
322	$c2 = ord(substr($$textref, $i+1, 1));
323	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
324	# put a black square if cannot translate
325	$c = 0x25A1 if $c == 0;
326	push (@outtext, $c);
327	$i += 2;
328
329	} else {
330	# error
331	print STDERR "unicode: ERROR missing second half of double-byte character\n";
332	$i++;
333	}
334
335	} else {
336	# single-byte character
337	push (@outtext, $c1);
338	$i++;
339	}
340	}
341	return \@outtext;
342	}
343
344	# Shift-JIS to unicode
345	# We can't use doublebyte2unicode for Shift-JIS because it uses some
346	# single-byte characters above code point 0x80 (i.e. half-width katakana
347	# characters in the range 0xA1-0xDF)
348	sub shiftjis2unicode {
349	my ($encodename, $textref) = @_;
350
351	my @outtext = ();
352	my $len = length($$textref);
353	my ($c1, $c2);
354	my $i = 0;
355
356	while ($i < $len) {
357	$c1 = ord(substr($$textref, $i, 1));
358
359	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
360	# Single-byte half-width katakana character or
361	# JIS Roman yen or overline characters
362	my $c = &transchar ($encodename, $c1);
363	# - put a black square if cannot translate
364	$c = 0x25A1 if $c == 0;
365	push (@outtext, $c);
366	$i++;
367
368	} elsif ($c1 < 0x80) {
369	# ASCII
370	push (@outtext, $c1);
371	$i ++;
372
373	} elsif ($c1 < 0xEF) {
374	if ($i+1 < $len) {
375	$c2 = ord(substr($$textref, $i+1, 1));
376	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
377	# Double-byte shift-jis character
378	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
379	# put a black square if cannot translate
380	$c = 0x25A1 if $c == 0;
381	push (@outtext, $c);
382	} else {
383	# error
384	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385	}
386	$i += 2;
387	} else {
388	# error
389	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390	$i ++;
391	}
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	$i ++;
396	}
397	}
398	return \@outtext;
399	}
400
401	sub transchar {
402	my ($encoding, $from) = @_;
403	my $high = ($from / 256) % 256;
404	my $low = $from % 256;
405
406	return 0 unless defined $unicode::translations{$encoding};
407
408	my $block = $unicode::translations{$encoding}->{'map'};
409
410	if (ref ($block->[$high]) ne "ARRAY") {
411	return 0;
412	}
413	return $block->[$high]->[$low];
414	}
415
416	# %translations is of the form:
417	#
418	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420	#
421	# Any of the top translation blocks can point to an undefined
422	# value. This data structure aims to allow fast translation and
423	# efficient storage.
424	%unicode::translations = ();
425
426	# @array256 is used for initialisation, there must be
427	# a better way...
428	# What about this?: @array256 = (0) x 256;
429	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446	# returns 1 if successful, 0 if unsuccessful
447	sub loadmapencoding {
448	my ($encoding, $mapfile) = @_;
449
450	# check to see if the encoding has already been loaded
451	return 1 if (defined $unicode::translations{$encoding});
452
453	if (! -r $mapfile \|\| -d $mapfile) {
454	return 0;
455	}
456	return 0 unless open (MAPFILE, $mapfile);
457	binmode (MAPFILE);
458
459	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460	my $block = $unicode::translations{$encoding};
461
462	my ($in,$i,$j);
463	while (1) {
464	my $ret=read(MAPFILE, $in, 1);
465	if (!defined($ret)) { # error
466	print STDERR "unicode.pm: error reading mapfile: $!\n";
467	last;
468	}
469	if ($ret != 1) { last }
470	$i = unpack ("C", $in);
471	$block->{'map'}->[$i] = [@unicode::array256];
472	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473	my ($n1, $n2) = unpack ("CC", $in);
474	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475	}
476	$block->{'count'} ++;
477	}
478
479	close (MAPFILE);
480	}
481
482	# unicode2singlebyte converts unicode to simple 8 bit encodings where
483	# characters below 0x80 are normal ascii characters and the rest are encoded
484	# using the appropriate mapping files.
485	#
486	# Examples of encodings that may be converted using unicode2singlebyte are
487	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488	sub unicode2singlebyte {
489	my ($uniref, $encoding) = @_;
490
491	my $outtext = "";
492	my $encodename = "unicode-$encoding";
493
494	if (!exists $encodings::encodings->{$encoding}) {
495	print STDERR "unicode.pm: ERROR - unsupported encoding "
496	. "'$encoding' requested\n";
497	return "";
498	}
499
500	my $enc_info = $encodings::encodings->{$encoding};
501	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502	"from_uc", $enc_info->{'mapfile'});
503	if (!&loadmapencoding ($encodename, $mapfile)) {
504	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505	return "";
506	}
507
508	foreach my $c (@$uniref) {
509	if ($c < 0x80) {
510	# normal ascii character
511	$outtext .= chr($c);
512	} else {
513	# extended ascii character
514	$c = &transchar ($encodename, $c);
515
516	# put a question mark if cannot translate
517	if ($c == 0) {
518	$outtext .= "?";
519	} else {
520	$outtext .= chr($c);
521	}
522	}
523	}
524	return $outtext;
525	}
526
527
528	# this makes sure that the referenced input string is utf8 encoded, and
529	# will change/remove bytes that aren't.
530	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531	sub ensure_utf8 {
532	my $stringref=shift;
533
534	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
535	return $stringref;
536	}
537
538	my $value=$$stringref;
539
540	my $non_utf8_found = 0;
541	$value =~ m/^/g; # to set \G
542	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543	my $highbytes=$1;
544	my $highbyteslength=length($highbytes);
545	# make sure this block of high bytes is utf-8
546	$highbytes =~ /^/g; # set pos()
547	my $byte_replaced = 0;
548	while ($highbytes =~
549	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
550	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
551	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
552	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
553	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
554	)*([\x80-\xff])? !xg
555	) {
556	# this highbyte is "out-of-place" for valid utf-8
557	my $badbyte=$1;
558	if (!defined $badbyte) {next} # hit end of string
559	my $pos=pos($highbytes);
560	# replace bad byte. assume iso-8859-1 -> utf-8
561	# ascii2utf8 does "extended ascii"... ie iso-8859-1
562	my $replacement=&unicode::ascii2utf8(\$badbyte);
563	substr($highbytes, $pos-1, 1, $replacement);
564	# update the position to continue searching (for \G)
565	pos($highbytes) = $pos+length($replacement)-1;
566	$byte_replaced = 1;
567	}
568	if ($byte_replaced) {
569	# replace this block of high bytes in the $value
570	$non_utf8_found = 1;
571	my $replength=length($highbytes); # we've changed the length
572	my $textpos=pos($value); # pos at end of last match
573	# replace bad bytes with good bytes
574	substr($value, $textpos-$highbyteslength,
575	$highbyteslength, $highbytes);
576	# update the position to continue searching (for \G)
577	pos($value)=$textpos+($replength-$highbyteslength)+1;
578	}
579	}
580
581	$$stringref = $value;
582	return $non_utf8_found;
583	}
584
585	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586	# Does not modify the string parameter.
587	sub check_is_utf8 {
588	my $value=shift;
589
590	if (!defined($value)) {
591	return 0; # not utf8 because it is undefined
592	}
593
594	$value =~ m/^/g; # to set \G
595	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596	my $highbytes=$1;
597	# make sure this block of high bytes is utf-8
598	$highbytes =~ /^/g; # set pos()
599	while ($highbytes =~
600	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
601	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
602	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
603	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
604	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
605	)*([\x80-\xff])? !xg
606	) {
607	my $badbyte=$1;
608	if (defined $badbyte) { # not end of string
609	return 0; # non-utf8 found
610	}
611	}
612	}
613
614	return 1;
615	}
616
617	sub url_encode {
618	my ($text) = @_;
619
620	if (!&is_url_encoded($text)) {
621	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622	# return the url-encoded character entity for underscore back to the entity
623	$text =~ s/%26%23095%3B/&\#095;/g;
624	}
625	return $text;
626	}
627
628	sub url_decode {
629	my ($text) = @_;
630
631	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
633	$text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
634
635	return $text;
636	}
637
638	sub is_url_encoded {
639	my ($text) = @_;
640	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
641	}
642
643	# When a filename on the filesystem is already URL-encoded, the
644	# URL to it will have %25s in place of every % sign, so that
645	# URLs in html pages can refer to the URL-encoded filename.
646	# This method changes the URL reference back into the actual
647	# (URL-encoded) filename on the filesystem by replacing %25 with %.
648	sub url_to_filename {
649	my ($text) =@_;
650	$text =~ s/%25/%/g if &is_url_encoded($text);
651	# DM safing would have replaced underscores with character entity _
652	# in SourceFile meta. Undo any such change to get the filename referred to.
653	$text =~ s/&\#095;/_/g;
654	return $text;
655	}
656
657	# When a filename on the filesystem is already URL-encoded, the
658	# URL to it will have %25s in place of every % sign, so that
659	# URLs in html pages can refer to the URL-encoded filename.
660	# Given a (URL-encoded) filename on the filesystem, this subroutine
661	# returns the URL reference string for it by replacing % with %25.
662	# The output string will be the same as the input string if the input
663	# already contains one or more %25s. This is to prevent processing
664	# a url more than once this way.
665	sub filename_to_url {
666	my ($text) = @_;
667
668	if($text !~ m/%25/) {
669	$text =~ s/%/%25/g;
670	}
671	return $text;
672	}
673
674	sub base64_encode {
675	my ($text) = @_;
676	if(!&conforms_to_mod_base64($text)) {
677	# return entity for underscore to underscore before encoding
678	$text =~ s/&\#095;/_/g;
679
680	$text = &MIME::Base64::encode_base64($text);
681	# base64 encoding may introduce + and / signs,
682	# replacing them with - and _ to ensure it's filename-safe
683	$text =~ s/\+/\-/g; # + -> -
684	$text =~ s/\//\_/g; # / -> _
685	}
686	return $text;
687	}
688
689	# If the input fits the modified base64 pattern, this will try decoding it.
690	# Still, this method does not guarantee the return value is the 'original', only
691	# that the result is where the base64 decoding process has been applied once.
692	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
693	sub base64_decode {
694	my ($text) = @_;
695	if(&conforms_to_mod_base64($text)) {
696	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
697	# have replaced them with - and _ respectively. Undo this before decoding.
698	$text =~ s/\-/\+/g; # - -> +
699	$text =~ s/\_/\//g; # _ -> /
700	$text = &MIME::Base64::decode_base64($text);
701	}
702	return $text;
703	}
704
705	# Returns true if the given string is compatible with a modified version
706	# of base64 (where the + and / are replaced with - and _), a format which
707	# includes also regular ASCII alphanumeric values. This method does not
708	# guarantee that the given string is actually base64 encoded, since it will
709	# return true for any simple alphanumeric ASCII string as well.
710	sub conforms_to_mod_base64 {
711	my ($text) = @_;
712
713	# need to treat the entity ref for underscore as underscore
714	$text =~ s/&\#095;/_/g;
715
716	# base 64 takes alphanumeric and [=+/],
717	# but we use modified base64 where + and / are replaced with - and _
718	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
719	}
720
721	sub substr
722	{
723	my ($utf8_string, $offset, $length) = @_;
724
725	my @unicode_string = @{&utf82unicode($utf8_string)};
726	my $unicode_string_length = scalar(@unicode_string);
727
728	my $substr_start = $offset;
729	if ($substr_start >= $unicode_string_length) {
730	return "";
731	}
732
733	my $substr_end = $offset + $length - 1;
734	if ($substr_end >= $unicode_string_length) {
735	$substr_end = $unicode_string_length - 1;
736	}
737
738	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
739	return &unicode2utf8(\@unicode_substring);
740	}
741
742	# Useful method to print UTF8 (or other unicode) for debugging.
743	# Characters that are easily displayed (that is, printable ASCII)
744	# are shown as-is, whereas hex values of the unicode code points
745	# are shown for all other chars.
746	sub debug_unicode_string
747	{
748	join("",
749	map { $_ > 128 ? # if wide character...
750	sprintf("\\x{%04X}", $_) : # \x{...}
751	chr($_)
752	} unpack("U*", $_[0])); # unpack Unicode characters
753	}
754
755
756	sub raw_filename_to_url_encoded
757	{
758	my ($str_in) = @_;
759
760	my @url_encoded_chars
761	= map { $_ > 255 ? # Needs to be represent in entity form
762	sprintf("&#x%X;",$_) :
763	$_ > 128 ? # Representable in %XX form
764	sprintf("%%%2X", $_) :
765	chr($_) # otherwise, Ascii char
766	} unpack("U*", $str_in); # Unpack Unicode characters
767
768
769	my $str_out = join("", @url_encoded_chars);
770
771	return $str_out;
772
773	}
774
775
776	sub url_encoded_to_raw_filename
777	{
778	my ($str_in) = @_;
779
780	my $str_out = $str_in;
781
782	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
783	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
784	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
785
786	return $str_out;
787	}
788
789	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: