Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago
Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.4 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use FileUtils;
39	use MIME::Base64; # for base64 encoding
40
41	no strict 'refs';
42
43
44
45	sub utf8decomp
46	{
47	my ($str) = @_;
48
49	return if (!defined $str);
50	return "" if ($str eq "");
51
52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54	my @each_char
55	= map { ($_ > 255 )
56	? # if wide character...
57	sprintf("\\x{%04X}", $_)
58	: # \x{...}
59	(chr($_) =~ m/[[:cntrl:]]/ )
60	? # else if control character ...
61	sprintf("\\x%02X", $_)
62	: # \x..
63	quotemeta(chr($_)) # else quoted or as themselves
64	} @unpacked_chars;
65
66	return join("",@each_char);
67	}
68
69
70	sub hex_codepoint {
71	if (my $char = shift) {
72	return sprintf '%2.2x', unpack('U0U*', $char);
73	}
74	}
75
76
77
78
79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80	# and returns a unicode array.
81	sub ascii2unicode {
82	my ($in) = @_;
83	my $out = [];
84
85	my $i = 0;
86	my $len = length($in);
87	while ($i < $len) {
88	push (@$out, ord(substr ($in, $i, 1)));
89	$i++;
90	}
91
92	return $out;
93	}
94
95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96	# UTF-8 encoded string. This is just a faster version of
97	# "&unicode2utf8(&ascii2unicode($str));"
98	# "Extended ascii" really means "iso_8859_1"
99	sub ascii2utf8 {
100	my ($in) = @_;
101	my $out = "";
102
103	if (!defined($in)\|\| !defined($$in)) {
104	return $out;
105	}
106
107	my ($c);
108	my $i = 0;
109	my $len = length($$in);
110	while ($i < $len) {
111	$c = ord (substr ($$in, $i, 1));
112	if ($c < 0x80) {
113	# ascii character
114	$out .= chr ($c);
115
116	} else {
117	# extended ascii character
118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
119	$out .= chr (0x80 + ($c & 0x3f));
120	}
121	$i++;
122	}
123
124	return $out;
125	}
126
127	# unicode2utf8 takes a unicode array as input and encodes it
128	# using utf-8
129	sub unicode2utf8 {
130	my ($in) = @_;
131	my $out = "";
132
133	foreach my $num (@$in) {
134	next unless defined $num;
135	if ($num < 0x80) {
136	$out .= chr ($num);
137
138	} elsif ($num < 0x800) {
139	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
140	$out .= chr (0x80 + ($num & 0x3f));
141
142	} elsif ($num < 0xFFFF) {
143	$out .= chr (0xe0 + (($num >> 12) & 0xf));
144	$out .= chr (0x80 + (($num >> 6) & 0x3f));
145	$out .= chr (0x80 + ($num & 0x3f));
146
147	} else {
148	# error, don't encode anything
149	die;
150	}
151	}
152	return $out;
153	}
154
155	# utf82unicode takes a utf-8 string and produces a unicode
156	# array
157	sub utf82unicode {
158	my ($in) = @_;
159	my $out = [];
160
161	my $i = 0;
162	my ($c1, $c2, $c3);
163	my $len = length($in);
164	while ($i < $len) {
165	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
166	# normal ascii character
167	push (@$out, $c1);
168
169	} elsif ($c1 < 0xc0) {
170	# error, was expecting the first byte of an
171	# encoded character. Do nothing.
172
173	} elsif ($c1 < 0xe0 && $i+1 < $len) {
174	# an encoded character with two bytes
175	$c2 = ord (substr ($in, $i+1, 1));
176	if ($c2 >= 0x80 && $c2 < 0xc0) {
177	# everything looks ok
178	push (@$out, ((($c1 & 0x1f) << 6) +
179	($c2 & 0x3f)));
180	$i++; # gobbled an extra byte
181	}
182
183	} elsif ($c1 < 0xf0 && $i+2 < $len) {
184	# an encoded character with three bytes
185	$c2 = ord (substr ($in, $i+1, 1));
186	$c3 = ord (substr ($in, $i+2, 1));
187	if ($c2 >= 0x80 && $c2 < 0xc0 &&
188	$c3 >= 0x80 && $c3 < 0xc0) {
189	# everything looks ok
190	push (@$out, ((($c1 & 0xf) << 12) +
191	(($c2 & 0x3f) << 6) +
192	($c3 & 0x3f)));
193
194	$i += 2; # gobbled an extra two bytes
195	}
196
197	} else {
198	# error, only decode Unicode characters not full UCS.
199	# Do nothing.
200	}
201
202	$i++;
203	}
204
205	return $out;
206	}
207
208	# unicode2ucs2 takes a unicode array and produces a UCS-2
209	# unicode string (every two bytes forms a unicode character)
210	sub unicode2ucs2 {
211	my ($in) = @_;
212	my $out = "";
213
214	foreach my $num (@$in) {
215	$out .= chr (($num & 0xff00) >> 8);
216	$out .= chr ($num & 0xff);
217	}
218
219	return $out;
220	}
221
222	# ucs22unicode takes a UCS-2 string and produces a unicode array
223	sub ucs22unicode {
224	my ($in) = @_;
225	my $out = [];
226
227	my $i = 0;
228	my $len = length ($in);
229	while ($i+1 < $len) {
230	push (@$out, ord (substr($in, $i, 1)) << 8 +
231	ord (substr($in, $i+1, 1)));
232
233	$i ++;
234	}
235
236	return $out;
237	}
238
239	# takes a reference to a string and returns a reference to a unicode array
240	sub convert2unicode {
241	my ($encoding, $textref) = @_;
242
243	if (!defined $encodings::encodings->{$encoding}) {
244	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
245	return [];
246	}
247
248	my $encodename = "$encoding-unicode";
249	my $enc_info = $encodings::encodings->{$encoding};
250	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
251	"to_uc", $enc_info->{'mapfile'});
252	if (!&loadmapencoding ($encodename, $mapfile)) {
253	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
254	return [];
255	}
256
257	if (defined $enc_info->{'converter'}) {
258	my $converter = $enc_info->{'converter'};
259	return &$converter ($encodename, $textref);
260	}
261
262	if ($unicode::translations{$encodename}->{'count'} == 1) {
263	return &singlebyte2unicode ($encodename, $textref);
264	} else {
265	return &doublebyte2unicode ($encodename, $textref);
266	}
267	}
268
269	# singlebyte2unicode converts simple 8 bit encodings where characters below
270	# 0x80 are normal ascii characters and the rest are decoded using the
271	# appropriate mapping files.
272	#
273	# Examples of encodings that may be converted using singlebyte2unicode are
274	# the iso-8859 and windows-125* series.
275	sub singlebyte2unicode {
276	my ($encodename, $textref) = @_;
277
278	my @outtext = ();
279	my $len = length($$textref);
280	my ($c);
281	my $i = 0;
282
283	while ($i < $len) {
284	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
285	# normal ascii character
286	push (@outtext, $c);
287	} else {
288	$c = &transchar ($encodename, $c);
289	# put a black square if cannot translate
290	$c = 0x25A1 if $c == 0;
291	push (@outtext, $c);
292	}
293	$i ++;
294	}
295	return \@outtext;
296	}
297
298	# doublebyte2unicode converts simple two byte encodings where characters
299	# below code point 0x80 are single-byte characters and the rest are
300	# double-byte characters.
301	#
302	# Examples of encodings that may be converted using doublebyte2unicode are
303	# CJK encodings like GB encoded Chinese and UHC Korean.
304	#
305	# Note that no error checking is performed to make sure that the input text
306	# is valid for the given encoding.
307	#
308	# Also, encodings that may contain characters of more than two bytes are
309	# not supported (any EUC encoded text may in theory contain 3-byte
310	# characters but in practice only one and two byte characters are used).
311	sub doublebyte2unicode {
312	my ($encodename, $textref) = @_;
313
314	my @outtext = ();
315	my $len = length($$textref);
316	my ($c1, $c2);
317	my $i = 0;
318
319	while ($i < $len) {
320	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
321	if ($i+1 < $len) {
322	# double-byte character
323	$c2 = ord(substr($$textref, $i+1, 1));
324	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
325	# put a black square if cannot translate
326	$c = 0x25A1 if $c == 0;
327	push (@outtext, $c);
328	$i += 2;
329
330	} else {
331	# error
332	print STDERR "unicode: ERROR missing second half of double-byte character\n";
333	$i++;
334	}
335
336	} else {
337	# single-byte character
338	push (@outtext, $c1);
339	$i++;
340	}
341	}
342	return \@outtext;
343	}
344
345	# Shift-JIS to unicode
346	# We can't use doublebyte2unicode for Shift-JIS because it uses some
347	# single-byte characters above code point 0x80 (i.e. half-width katakana
348	# characters in the range 0xA1-0xDF)
349	sub shiftjis2unicode {
350	my ($encodename, $textref) = @_;
351
352	my @outtext = ();
353	my $len = length($$textref);
354	my ($c1, $c2);
355	my $i = 0;
356
357	while ($i < $len) {
358	$c1 = ord(substr($$textref, $i, 1));
359
360	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
361	# Single-byte half-width katakana character or
362	# JIS Roman yen or overline characters
363	my $c = &transchar ($encodename, $c1);
364	# - put a black square if cannot translate
365	$c = 0x25A1 if $c == 0;
366	push (@outtext, $c);
367	$i++;
368
369	} elsif ($c1 < 0x80) {
370	# ASCII
371	push (@outtext, $c1);
372	$i ++;
373
374	} elsif ($c1 < 0xEF) {
375	if ($i+1 < $len) {
376	$c2 = ord(substr($$textref, $i+1, 1));
377	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
378	# Double-byte shift-jis character
379	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
380	# put a black square if cannot translate
381	$c = 0x25A1 if $c == 0;
382	push (@outtext, $c);
383	} else {
384	# error
385	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
386	}
387	$i += 2;
388	} else {
389	# error
390	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
391	$i ++;
392	}
393	} else {
394	# error
395	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
396	$i ++;
397	}
398	}
399	return \@outtext;
400	}
401
402	sub transchar {
403	my ($encoding, $from) = @_;
404	my $high = ($from / 256) % 256;
405	my $low = $from % 256;
406
407	return 0 unless defined $unicode::translations{$encoding};
408
409	my $block = $unicode::translations{$encoding}->{'map'};
410
411	if (ref ($block->[$high]) ne "ARRAY") {
412	return 0;
413	}
414	return $block->[$high]->[$low];
415	}
416
417	# %translations is of the form:
418	#
419	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
420	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
421	#
422	# Any of the top translation blocks can point to an undefined
423	# value. This data structure aims to allow fast translation and
424	# efficient storage.
425	%unicode::translations = ();
426
427	# @array256 is used for initialisation, there must be
428	# a better way...
429	# What about this?: @array256 = (0) x 256;
430	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
446
447	# returns 1 if successful, 0 if unsuccessful
448	sub loadmapencoding {
449	my ($encoding, $mapfile) = @_;
450
451	# check to see if the encoding has already been loaded
452	return 1 if (defined $unicode::translations{$encoding});
453
454	if (! -r $mapfile \|\| -d $mapfile) {
455	return 0;
456	}
457	return 0 unless open (MAPFILE, $mapfile);
458	binmode (MAPFILE);
459
460	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
461	my $block = $unicode::translations{$encoding};
462
463	my ($in,$i,$j);
464	while (1) {
465	my $ret=read(MAPFILE, $in, 1);
466	if (!defined($ret)) { # error
467	print STDERR "unicode.pm: error reading mapfile: $!\n";
468	last;
469	}
470	if ($ret != 1) { last }
471	$i = unpack ("C", $in);
472	$block->{'map'}->[$i] = [@unicode::array256];
473	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
474	my ($n1, $n2) = unpack ("CC", $in);
475	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
476	}
477	$block->{'count'} ++;
478	}
479
480	close (MAPFILE);
481	}
482
483	# unicode2singlebyte converts unicode to simple 8 bit encodings where
484	# characters below 0x80 are normal ascii characters and the rest are encoded
485	# using the appropriate mapping files.
486	#
487	# Examples of encodings that may be converted using unicode2singlebyte are
488	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
489	sub unicode2singlebyte {
490	my ($uniref, $encoding) = @_;
491
492	my $outtext = "";
493	my $encodename = "unicode-$encoding";
494
495	if (!exists $encodings::encodings->{$encoding}) {
496	print STDERR "unicode.pm: ERROR - unsupported encoding "
497	. "'$encoding' requested\n";
498	return "";
499	}
500
501	my $enc_info = $encodings::encodings->{$encoding};
502	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
503	"from_uc", $enc_info->{'mapfile'});
504	if (!&loadmapencoding ($encodename, $mapfile)) {
505	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
506	return "";
507	}
508
509	foreach my $c (@$uniref) {
510	if ($c < 0x80) {
511	# normal ascii character
512	$outtext .= chr($c);
513	} else {
514	# extended ascii character
515	$c = &transchar ($encodename, $c);
516
517	# put a question mark if cannot translate
518	if ($c == 0) {
519	$outtext .= "?";
520	} else {
521	$outtext .= chr($c);
522	}
523	}
524	}
525	return $outtext;
526	}
527
528
529	# this makes sure that the referenced input string is utf8 encoded, and
530	# will change/remove bytes that aren't.
531	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
532	sub ensure_utf8 {
533	my $stringref=shift;
534
535	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
536	return $stringref;
537	}
538
539	my $value=$$stringref;
540
541	my $non_utf8_found = 0;
542	$value =~ m/^/g; # to set \G
543	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
544	my $highbytes=$1;
545	my $highbyteslength=length($highbytes);
546	# make sure this block of high bytes is utf-8
547	$highbytes =~ /^/g; # set pos()
548	my $byte_replaced = 0;
549	while ($highbytes =~
550	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
551	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
552	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
553	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
554	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
555	)*([\x80-\xff])? !xg
556	) {
557	# this highbyte is "out-of-place" for valid utf-8
558	my $badbyte=$1;
559	if (!defined $badbyte) {next} # hit end of string
560	my $pos=pos($highbytes);
561	# replace bad byte. assume iso-8859-1 -> utf-8
562	# ascii2utf8 does "extended ascii"... ie iso-8859-1
563	my $replacement=&unicode::ascii2utf8(\$badbyte);
564	substr($highbytes, $pos-1, 1, $replacement);
565	# update the position to continue searching (for \G)
566	pos($highbytes) = $pos+length($replacement)-1;
567	$byte_replaced = 1;
568	}
569	if ($byte_replaced) {
570	# replace this block of high bytes in the $value
571	$non_utf8_found = 1;
572	my $replength=length($highbytes); # we've changed the length
573	my $textpos=pos($value); # pos at end of last match
574	# replace bad bytes with good bytes
575	substr($value, $textpos-$highbyteslength,
576	$highbyteslength, $highbytes);
577	# update the position to continue searching (for \G)
578	pos($value)=$textpos+($replength-$highbyteslength)+1;
579	}
580	}
581
582	$$stringref = $value;
583	return $non_utf8_found;
584	}
585
586	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
587	# Does not modify the string parameter.
588	sub check_is_utf8 {
589	my $value=shift;
590
591	if (!defined($value)) {
592	return 0; # not utf8 because it is undefined
593	}
594
595	$value =~ m/^/g; # to set \G
596	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
597	my $highbytes=$1;
598	# make sure this block of high bytes is utf-8
599	$highbytes =~ /^/g; # set pos()
600	while ($highbytes =~
601	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
602	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
603	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
604	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
605	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
606	)*([\x80-\xff])? !xg
607	) {
608	my $badbyte=$1;
609	if (defined $badbyte) { # not end of string
610	return 0; # non-utf8 found
611	}
612	}
613	}
614
615	return 1;
616	}
617
618	sub url_encode {
619	my ($text) = @_;
620
621	if (!&is_url_encoded($text)) {
622	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
623	# return the url-encoded character entity for underscore back to the entity
624	$text =~ s/%26%23095%3B/&\#095;/g;
625	}
626	return $text;
627	}
628
629	sub url_decode {
630	my ($text,$and_numeric_entities) = @_;
631
632	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
633
634	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
635	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
636	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
637	}
638
639	return $text;
640	}
641
642	sub url_decode_utf8 {
643	my ($text,$and_numeric_entities) = @_;
644
645	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
646
647	$text = Encode::decode("utf8",$text);
648
649	return $text;
650	}
651
652	sub is_url_encoded {
653	my ($text) = @_;
654	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
655	}
656
657	# When a filename on the filesystem is already URL-encoded, the
658	# URL to it will have %25s in place of every % sign, so that
659	# URLs in html pages can refer to the URL-encoded filename.
660	# This method changes the URL reference back into the actual
661	# (URL-encoded) filename on the filesystem by replacing %25 with %.
662	sub url_to_filename {
663	my ($text) =@_;
664	$text =~ s/%25/%/g if &is_url_encoded($text);
665	# DM safing would have replaced underscores with character entity _
666	# in SourceFile meta. Undo any such change to get the filename referred to.
667	$text =~ s/&\#095;/_/g;
668	return $text;
669	}
670
671	# When a filename on the filesystem is already URL-encoded, the
672	# URL to it will have %25s in place of every % sign, so that
673	# URLs in html pages can refer to the URL-encoded filename.
674	# Given a (URL-encoded) filename on the filesystem, this subroutine
675	# returns the URL reference string for it by replacing % with %25.
676	# The output string will be the same as the input string if the input
677	# already contains one or more %25s. This is to prevent processing
678	# a url more than once this way.
679	sub filename_to_url {
680	my ($text) = @_;
681
682	if($text !~ m/%25/) {
683	$text =~ s/%/%25/g;
684	}
685	return $text;
686	}
687
688	sub base64_encode {
689	my ($text) = @_;
690	if(!&conforms_to_mod_base64($text)) {
691	# return entity for underscore to underscore before encoding
692	$text =~ s/&\#095;/_/g;
693
694	$text = &MIME::Base64::encode_base64($text);
695	# base64 encoding may introduce + and / signs,
696	# replacing them with - and _ to ensure it's filename-safe
697	$text =~ s/\+/\-/g; # + -> -
698	$text =~ s/\//\_/g; # / -> _
699	}
700	return $text;
701	}
702
703	# If the input fits the modified base64 pattern, this will try decoding it.
704	# Still, this method does not guarantee the return value is the 'original', only
705	# that the result is where the base64 decoding process has been applied once.
706	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
707	sub base64_decode {
708	my ($text) = @_;
709	if(&conforms_to_mod_base64($text)) {
710	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
711	# have replaced them with - and _ respectively. Undo this before decoding.
712	$text =~ s/\-/\+/g; # - -> +
713	$text =~ s/\_/\//g; # _ -> /
714	$text = &MIME::Base64::decode_base64($text);
715	}
716	return $text;
717	}
718
719	# Returns true if the given string is compatible with a modified version
720	# of base64 (where the + and / are replaced with - and _), a format which
721	# includes also regular ASCII alphanumeric values. This method does not
722	# guarantee that the given string is actually base64 encoded, since it will
723	# return true for any simple alphanumeric ASCII string as well.
724	sub conforms_to_mod_base64 {
725	my ($text) = @_;
726
727	# need to treat the entity ref for underscore as underscore
728	$text =~ s/&\#095;/_/g;
729
730	# base 64 takes alphanumeric and [=+/],
731	# but we use modified base64 where + and / are replaced with - and _
732	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
733	}
734
735	sub substr
736	{
737	my ($utf8_string, $offset, $length) = @_;
738
739	my @unicode_string = @{&utf82unicode($utf8_string)};
740	my $unicode_string_length = scalar(@unicode_string);
741
742	my $substr_start = $offset;
743	if ($substr_start >= $unicode_string_length) {
744	return "";
745	}
746
747	my $substr_end = $offset + $length - 1;
748	if ($substr_end >= $unicode_string_length) {
749	$substr_end = $unicode_string_length - 1;
750	}
751
752	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
753	return &unicode2utf8(\@unicode_substring);
754	}
755
756	# Useful method to print UTF8 (or other unicode) for debugging.
757	# Characters that are easily displayed (that is, printable ASCII)
758	# are shown as-is, whereas hex values of the unicode code points
759	# are shown for all other chars.
760	sub debug_unicode_string
761	{
762	join("",
763	map { $_ > 127 ? # if wide character...
764	sprintf("\\x{%04X}", $_) : # \x{...}
765	chr($_)
766	} unpack("U*", $_[0])); # unpack Unicode characters
767	}
768
769
770	sub raw_filename_to_url_encoded
771	{
772	my ($str_in) = @_;
773
774	my @url_encoded_chars
775	= map { $_ > 255 ? # Needs to be represent in entity form
776	sprintf("&#x%X;",$_) :
777	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
778	sprintf("%%%2X", $_) :
779	chr($_) # otherwise, Ascii char
780	} unpack("U*", $str_in); # Unpack Unicode characters
781
782
783	my $str_out = join("", @url_encoded_chars);
784
785	return $str_out;
786
787	}
788
789	sub url_encoded_to_raw_filename
790	{
791	my ($str_in) = @_;
792
793	my $str_out = $str_in;
794
795	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
796	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
797	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
798
799	return $str_out;
800	}
801
802
803	sub raw_filename_to_utf8_url_encoded
804	{
805	my ($str_in) = @_;
806
807	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
808
809	my @url_encoded_chars
810	= map { $_ > 127 ? # Representable in %XX form
811	sprintf("%%%2X", $_) :
812	chr($_) # otherwise, Ascii char
813	} unpack("U*", $str_in); # Unpack utf8 characters
814
815
816	my $str_out = join("", @url_encoded_chars);
817
818	return $str_out;
819
820	}
821
822	sub utf8_url_encoded_to_raw_filename
823	{
824	my ($str_in) = @_;
825
826	my $utf8_str_out = $str_in;
827
828	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
829
830	my $unicode_str_out = decode("utf8",$utf8_str_out);
831	my $raw_str_out = utf8::downgrade($unicode_str_out);
832
833	return $raw_str_out;
834	}
835
836	sub analyze_raw_string
837	{
838	my ($str_in) = @_;
839
840	my $uses_bytecodes = 0;
841	my $exceeds_bytecodes = 0;
842
843	map { $exceeds_bytecodes = 1 if ($_ >= 256);
844	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
845	} unpack("U*", $str_in); # Unpack Unicode characters
846
847	return ($uses_bytecodes,$exceeds_bytecodes);
848	}
849
850
851	sub convert_utf8_string_to_unicode_string
852	{
853	my $utf8_string = shift(@_);
854
855	my $unicode_string = "";
856	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
857	$unicode_string .= chr($unicode_value);
858	}
859	return $unicode_string;
860	}
861
862	sub convert_unicode_string_to_utf8_string
863	{
864	my $unicode_string = shift(@_);
865
866	my @unicode_array;
867	for (my $i = 0; $i < length($unicode_string); $i++) {
868	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
869	}
870	return &unicode::unicode2utf8(\@unicode_array);
871	}
872
873
874	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: