Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 32594

Last change on this file since 32594 was 28796, checked in by ak19, 10 years ago
Diego's Spano's bugfix to bug found by Justin Cooper when Justin was trying to process a problematic PDF. The mailing list documents their discussion.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.7 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use FileUtils;
39	use MIME::Base64; # for base64 encoding
40
41	no strict 'refs';
42
43
44
45	sub utf8decomp
46	{
47	my ($str) = @_;
48
49	return if (!defined $str);
50	return "" if ($str eq "");
51
52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54	my @each_char
55	= map { ($_ > 255 )
56	? # if wide character...
57	sprintf("\\x{%04X}", $_)
58	: # \x{...}
59	(chr($_) =~ m/[[:cntrl:]]/ )
60	? # else if control character ...
61	sprintf("\\x%02X", $_)
62	: # \x..
63	quotemeta(chr($_)) # else quoted or as themselves
64	} @unpacked_chars;
65
66	return join("",@each_char);
67	}
68
69
70	sub hex_codepoint {
71	if (my $char = shift) {
72	return sprintf '%2.2x', unpack('U0U*', $char);
73	}
74	}
75
76
77
78
79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80	# and returns a unicode array.
81	sub ascii2unicode {
82	my ($in) = @_;
83	my $out = [];
84
85	my $i = 0;
86	my $len = length($in);
87	while ($i < $len) {
88	push (@$out, ord(substr ($in, $i, 1)));
89	$i++;
90	}
91
92	return $out;
93	}
94
95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96	# UTF-8 encoded string. This is just a faster version of
97	# "&unicode2utf8(&ascii2unicode($str));"
98	# "Extended ascii" really means "iso_8859_1"
99	sub ascii2utf8 {
100	my ($in) = @_;
101	my $out = "";
102
103	if (!defined($in)\|\| !defined($$in)) {
104	return $out;
105	}
106
107	my ($c);
108	my $i = 0;
109	my $len = length($$in);
110	while ($i < $len) {
111	$c = ord (substr ($$in, $i, 1));
112	if ($c < 0x80) {
113	# ascii character
114	$out .= chr ($c);
115
116	} else {
117	# extended ascii character
118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
119	$out .= chr (0x80 + ($c & 0x3f));
120	}
121	$i++;
122	}
123
124	return $out;
125	}
126
127	# unicode2utf8 takes a unicode array as input and encodes it
128	# using utf-8
129	sub unicode2utf8 {
130	my ($in) = @_;
131	my $out = "";
132
133	foreach my $num (@$in) {
134	next unless defined $num;
135	if ($num < 0x80) {
136	$out .= chr ($num);
137
138	} elsif ($num < 0x800) {
139	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
140	$out .= chr (0x80 + ($num & 0x3f));
141
142	} elsif ($num < 0xFFFF) {
143	$out .= chr (0xe0 + (($num >> 12) & 0xf));
144	$out .= chr (0x80 + (($num >> 6) & 0x3f));
145	$out .= chr (0x80 + ($num & 0x3f));
146
147	} else {
148	# error, don't encode anything
149	#die;
150	# Diego's bugfix: instead of aborting the import process, it
151	# is better to get a converted file with a few extra spaces
152	print STDERR "strange char: $num\n";
153	$out .= " ";
154
155	}
156	}
157	return $out;
158	}
159
160	# utf82unicode takes a utf-8 string and produces a unicode
161	# array
162	sub utf82unicode {
163	my ($in) = @_;
164	my $out = [];
165
166	if(!defined $in) {
167	return $out;
168	}
169
170	my $i = 0;
171	my ($c1, $c2, $c3);
172	my $len = length($in);
173	while ($i < $len) {
174	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175	# normal ascii character
176	push (@$out, $c1);
177
178	} elsif ($c1 < 0xc0) {
179	# error, was expecting the first byte of an
180	# encoded character. Do nothing.
181
182	} elsif ($c1 < 0xe0 && $i+1 < $len) {
183	# an encoded character with two bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	if ($c2 >= 0x80 && $c2 < 0xc0) {
186	# everything looks ok
187	push (@$out, ((($c1 & 0x1f) << 6) +
188	($c2 & 0x3f)));
189	$i++; # gobbled an extra byte
190	}
191
192	} elsif ($c1 < 0xf0 && $i+2 < $len) {
193	# an encoded character with three bytes
194	$c2 = ord (substr ($in, $i+1, 1));
195	$c3 = ord (substr ($in, $i+2, 1));
196	if ($c2 >= 0x80 && $c2 < 0xc0 &&
197	$c3 >= 0x80 && $c3 < 0xc0) {
198	# everything looks ok
199	push (@$out, ((($c1 & 0xf) << 12) +
200	(($c2 & 0x3f) << 6) +
201	($c3 & 0x3f)));
202
203	$i += 2; # gobbled an extra two bytes
204	}
205
206	} else {
207	# error, only decode Unicode characters not full UCS.
208	# Do nothing.
209	}
210
211	$i++;
212	}
213
214	return $out;
215	}
216
217	# unicode2ucs2 takes a unicode array and produces a UCS-2
218	# unicode string (every two bytes forms a unicode character)
219	sub unicode2ucs2 {
220	my ($in) = @_;
221	my $out = "";
222
223	foreach my $num (@$in) {
224	$out .= chr (($num & 0xff00) >> 8);
225	$out .= chr ($num & 0xff);
226	}
227
228	return $out;
229	}
230
231	# ucs22unicode takes a UCS-2 string and produces a unicode array
232	sub ucs22unicode {
233	my ($in) = @_;
234	my $out = [];
235
236	my $i = 0;
237	my $len = length ($in);
238	while ($i+1 < $len) {
239	push (@$out, ord (substr($in, $i, 1)) << 8 +
240	ord (substr($in, $i+1, 1)));
241
242	$i ++;
243	}
244
245	return $out;
246	}
247
248	# takes a reference to a string and returns a reference to a unicode array
249	sub convert2unicode {
250	my ($encoding, $textref) = @_;
251
252	if (!defined $encodings::encodings->{$encoding}) {
253	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254	return [];
255	}
256
257	my $encodename = "$encoding-unicode";
258	my $enc_info = $encodings::encodings->{$encoding};
259	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260	"to_uc", $enc_info->{'mapfile'});
261	if (!&loadmapencoding ($encodename, $mapfile)) {
262	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263	return [];
264	}
265
266	if (defined $enc_info->{'converter'}) {
267	my $converter = $enc_info->{'converter'};
268	return &$converter ($encodename, $textref);
269	}
270
271	if ($unicode::translations{$encodename}->{'count'} == 1) {
272	return &singlebyte2unicode ($encodename, $textref);
273	} else {
274	return &doublebyte2unicode ($encodename, $textref);
275	}
276	}
277
278	# singlebyte2unicode converts simple 8 bit encodings where characters below
279	# 0x80 are normal ascii characters and the rest are decoded using the
280	# appropriate mapping files.
281	#
282	# Examples of encodings that may be converted using singlebyte2unicode are
283	# the iso-8859 and windows-125* series.
284	sub singlebyte2unicode {
285	my ($encodename, $textref) = @_;
286
287	my @outtext = ();
288	my $len = length($$textref);
289	my ($c);
290	my $i = 0;
291
292	while ($i < $len) {
293	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294	# normal ascii character
295	push (@outtext, $c);
296	} else {
297	$c = &transchar ($encodename, $c);
298	# put a black square if cannot translate
299	$c = 0x25A1 if $c == 0;
300	push (@outtext, $c);
301	}
302	$i ++;
303	}
304	return \@outtext;
305	}
306
307	# doublebyte2unicode converts simple two byte encodings where characters
308	# below code point 0x80 are single-byte characters and the rest are
309	# double-byte characters.
310	#
311	# Examples of encodings that may be converted using doublebyte2unicode are
312	# CJK encodings like GB encoded Chinese and UHC Korean.
313	#
314	# Note that no error checking is performed to make sure that the input text
315	# is valid for the given encoding.
316	#
317	# Also, encodings that may contain characters of more than two bytes are
318	# not supported (any EUC encoded text may in theory contain 3-byte
319	# characters but in practice only one and two byte characters are used).
320	sub doublebyte2unicode {
321	my ($encodename, $textref) = @_;
322
323	my @outtext = ();
324	my $len = length($$textref);
325	my ($c1, $c2);
326	my $i = 0;
327
328	while ($i < $len) {
329	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330	if ($i+1 < $len) {
331	# double-byte character
332	$c2 = ord(substr($$textref, $i+1, 1));
333	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
334	# put a black square if cannot translate
335	$c = 0x25A1 if $c == 0;
336	push (@outtext, $c);
337	$i += 2;
338
339	} else {
340	# error
341	print STDERR "unicode: ERROR missing second half of double-byte character\n";
342	$i++;
343	}
344
345	} else {
346	# single-byte character
347	push (@outtext, $c1);
348	$i++;
349	}
350	}
351	return \@outtext;
352	}
353
354	# Shift-JIS to unicode
355	# We can't use doublebyte2unicode for Shift-JIS because it uses some
356	# single-byte characters above code point 0x80 (i.e. half-width katakana
357	# characters in the range 0xA1-0xDF)
358	sub shiftjis2unicode {
359	my ($encodename, $textref) = @_;
360
361	my @outtext = ();
362	my $len = length($$textref);
363	my ($c1, $c2);
364	my $i = 0;
365
366	while ($i < $len) {
367	$c1 = ord(substr($$textref, $i, 1));
368
369	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
370	# Single-byte half-width katakana character or
371	# JIS Roman yen or overline characters
372	my $c = &transchar ($encodename, $c1);
373	# - put a black square if cannot translate
374	$c = 0x25A1 if $c == 0;
375	push (@outtext, $c);
376	$i++;
377
378	} elsif ($c1 < 0x80) {
379	# ASCII
380	push (@outtext, $c1);
381	$i ++;
382
383	} elsif ($c1 < 0xEF) {
384	if ($i+1 < $len) {
385	$c2 = ord(substr($$textref, $i+1, 1));
386	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
387	# Double-byte shift-jis character
388	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
389	# put a black square if cannot translate
390	$c = 0x25A1 if $c == 0;
391	push (@outtext, $c);
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	}
396	$i += 2;
397	} else {
398	# error
399	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400	$i ++;
401	}
402	} else {
403	# error
404	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405	$i ++;
406	}
407	}
408	return \@outtext;
409	}
410
411	sub transchar {
412	my ($encoding, $from) = @_;
413	my $high = ($from / 256) % 256;
414	my $low = $from % 256;
415
416	return 0 unless defined $unicode::translations{$encoding};
417
418	my $block = $unicode::translations{$encoding}->{'map'};
419
420	if (ref ($block->[$high]) ne "ARRAY") {
421	return 0;
422	}
423	return $block->[$high]->[$low];
424	}
425
426	# %translations is of the form:
427	#
428	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430	#
431	# Any of the top translation blocks can point to an undefined
432	# value. This data structure aims to allow fast translation and
433	# efficient storage.
434	%unicode::translations = ();
435
436	# @array256 is used for initialisation, there must be
437	# a better way...
438	# What about this?: @array256 = (0) x 256;
439	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456	# returns 1 if successful, 0 if unsuccessful
457	sub loadmapencoding {
458	my ($encoding, $mapfile) = @_;
459
460	# check to see if the encoding has already been loaded
461	return 1 if (defined $unicode::translations{$encoding});
462
463	if (! -r $mapfile \|\| -d $mapfile) {
464	return 0;
465	}
466	return 0 unless open (MAPFILE, $mapfile);
467	binmode (MAPFILE);
468
469	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470	my $block = $unicode::translations{$encoding};
471
472	my ($in,$i,$j);
473	while (1) {
474	my $ret=read(MAPFILE, $in, 1);
475	if (!defined($ret)) { # error
476	print STDERR "unicode.pm: error reading mapfile: $!\n";
477	last;
478	}
479	if ($ret != 1) { last }
480	$i = unpack ("C", $in);
481	$block->{'map'}->[$i] = [@unicode::array256];
482	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483	my ($n1, $n2) = unpack ("CC", $in);
484	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485	}
486	$block->{'count'} ++;
487	}
488
489	close (MAPFILE);
490	}
491
492	# unicode2singlebyte converts unicode to simple 8 bit encodings where
493	# characters below 0x80 are normal ascii characters and the rest are encoded
494	# using the appropriate mapping files.
495	#
496	# Examples of encodings that may be converted using unicode2singlebyte are
497	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498	sub unicode2singlebyte {
499	my ($uniref, $encoding) = @_;
500
501	my $outtext = "";
502	my $encodename = "unicode-$encoding";
503
504	if (!exists $encodings::encodings->{$encoding}) {
505	print STDERR "unicode.pm: ERROR - unsupported encoding "
506	. "'$encoding' requested\n";
507	return "";
508	}
509
510	my $enc_info = $encodings::encodings->{$encoding};
511	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512	"from_uc", $enc_info->{'mapfile'});
513	if (!&loadmapencoding ($encodename, $mapfile)) {
514	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515	return "";
516	}
517
518	foreach my $c (@$uniref) {
519	if ($c < 0x80) {
520	# normal ascii character
521	$outtext .= chr($c);
522	} else {
523	# extended ascii character
524	$c = &transchar ($encodename, $c);
525
526	# put a question mark if cannot translate
527	if ($c == 0) {
528	$outtext .= "?";
529	} else {
530	$outtext .= chr($c);
531	}
532	}
533	}
534	return $outtext;
535	}
536
537
538	# this makes sure that the referenced input string is utf8 encoded, and
539	# will change/remove bytes that aren't.
540	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541	sub ensure_utf8 {
542	my $stringref=shift;
543
544	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
545	return $stringref;
546	}
547
548	my $value=$$stringref;
549
550	my $non_utf8_found = 0;
551	$value =~ m/^/g; # to set \G
552	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553	my $highbytes=$1;
554	my $highbyteslength=length($highbytes);
555	# make sure this block of high bytes is utf-8
556	$highbytes =~ /^/g; # set pos()
557	my $byte_replaced = 0;
558	while ($highbytes =~
559	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
560	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
561	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
562	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
563	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
564	)*([\x80-\xff])? !xg
565	) {
566	# this highbyte is "out-of-place" for valid utf-8
567	my $badbyte=$1;
568	if (!defined $badbyte) {next} # hit end of string
569	my $pos=pos($highbytes);
570	# replace bad byte. assume iso-8859-1 -> utf-8
571	# ascii2utf8 does "extended ascii"... ie iso-8859-1
572	my $replacement=&unicode::ascii2utf8(\$badbyte);
573	substr($highbytes, $pos-1, 1, $replacement);
574	# update the position to continue searching (for \G)
575	pos($highbytes) = $pos+length($replacement)-1;
576	$byte_replaced = 1;
577	}
578	if ($byte_replaced) {
579	# replace this block of high bytes in the $value
580	$non_utf8_found = 1;
581	my $replength=length($highbytes); # we've changed the length
582	my $textpos=pos($value); # pos at end of last match
583	# replace bad bytes with good bytes
584	substr($value, $textpos-$highbyteslength,
585	$highbyteslength, $highbytes);
586	# update the position to continue searching (for \G)
587	pos($value)=$textpos+($replength-$highbyteslength)+1;
588	}
589	}
590
591	$$stringref = $value;
592	return $non_utf8_found;
593	}
594
595	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596	# Does not modify the string parameter.
597	sub check_is_utf8 {
598	my $value=shift;
599
600	if (!defined($value)) {
601	return 0; # not utf8 because it is undefined
602	}
603
604	$value =~ m/^/g; # to set \G
605	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606	my $highbytes=$1;
607	# make sure this block of high bytes is utf-8
608	$highbytes =~ /^/g; # set pos()
609	while ($highbytes =~
610	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
611	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
612	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
613	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
614	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
615	)*([\x80-\xff])? !xg
616	) {
617	my $badbyte=$1;
618	if (defined $badbyte) { # not end of string
619	return 0; # non-utf8 found
620	}
621	}
622	}
623
624	return 1;
625	}
626
627	sub url_encode {
628	my ($text) = @_;
629
630	if (!&is_url_encoded($text)) {
631	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632	# return the url-encoded character entity for underscore back to the entity
633	$text =~ s/%26%23095%3B/&\#095;/g;
634	}
635	return $text;
636	}
637
638	sub url_decode {
639	my ($text,$and_numeric_entities) = @_;
640
641	if(defined $text) {
642	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647	}
648	}
649
650	return $text;
651	}
652
653	sub url_decode_utf8 {
654	my ($text,$and_numeric_entities) = @_;
655
656	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658	$text = Encode::decode("utf8",$text);
659
660	return $text;
661	}
662
663	sub is_url_encoded {
664	my ($text) = @_;
665	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
666	}
667
668	# When a filename on the filesystem is already URL-encoded, the
669	# URL to it will have %25s in place of every % sign, so that
670	# URLs in html pages can refer to the URL-encoded filename.
671	# This method changes the URL reference back into the actual
672	# (URL-encoded) filename on the filesystem by replacing %25 with %.
673	sub url_to_filename {
674	my ($text) =@_;
675	$text =~ s/%25/%/g if &is_url_encoded($text);
676	# DM safing would have replaced underscores with character entity _
677	# in SourceFile meta. Undo any such change to get the filename referred to.
678	$text =~ s/&\#095;/_/g;
679	return $text;
680	}
681
682	# When a filename on the filesystem is already URL-encoded, the
683	# URL to it will have %25s in place of every % sign, so that
684	# URLs in html pages can refer to the URL-encoded filename.
685	# Given a (URL-encoded) filename on the filesystem, this subroutine
686	# returns the URL reference string for it by replacing % with %25.
687	# The output string will be the same as the input string if the input
688	# already contains one or more %25s. This is to prevent processing
689	# a url more than once this way.
690	sub filename_to_url {
691	my ($text) = @_;
692
693	if($text !~ m/%25/) {
694	$text =~ s/%/%25/g;
695	}
696	return $text;
697	}
698
699	sub base64_encode {
700	my ($text) = @_;
701	if(!&conforms_to_mod_base64($text)) {
702	# return entity for underscore to underscore before encoding
703	$text =~ s/&\#095;/_/g;
704
705	$text = &MIME::Base64::encode_base64($text);
706	# base64 encoding may introduce + and / signs,
707	# replacing them with - and _ to ensure it's filename-safe
708	$text =~ s/\+/\-/g; # + -> -
709	$text =~ s/\//\_/g; # / -> _
710	}
711	return $text;
712	}
713
714	# If the input fits the modified base64 pattern, this will try decoding it.
715	# Still, this method does not guarantee the return value is the 'original', only
716	# that the result is where the base64 decoding process has been applied once.
717	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
718	sub base64_decode {
719	my ($text) = @_;
720	if(&conforms_to_mod_base64($text)) {
721	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
722	# have replaced them with - and _ respectively. Undo this before decoding.
723	$text =~ s/\-/\+/g; # - -> +
724	$text =~ s/\_/\//g; # _ -> /
725	$text = &MIME::Base64::decode_base64($text);
726	}
727	return $text;
728	}
729
730	# Returns true if the given string is compatible with a modified version
731	# of base64 (where the + and / are replaced with - and _), a format which
732	# includes also regular ASCII alphanumeric values. This method does not
733	# guarantee that the given string is actually base64 encoded, since it will
734	# return true for any simple alphanumeric ASCII string as well.
735	sub conforms_to_mod_base64 {
736	my ($text) = @_;
737
738	# need to treat the entity ref for underscore as underscore
739	$text =~ s/&\#095;/_/g;
740
741	# base 64 takes alphanumeric and [=+/],
742	# but we use modified base64 where + and / are replaced with - and _
743	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
744	}
745
746	sub substr
747	{
748	my ($utf8_string, $offset, $length) = @_;
749
750	my @unicode_string = @{&utf82unicode($utf8_string)};
751	my $unicode_string_length = scalar(@unicode_string);
752
753	my $substr_start = $offset;
754	if ($substr_start >= $unicode_string_length) {
755	return "";
756	}
757
758	my $substr_end = $offset + $length - 1;
759	if ($substr_end >= $unicode_string_length) {
760	$substr_end = $unicode_string_length - 1;
761	}
762
763	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
764	return &unicode2utf8(\@unicode_substring);
765	}
766
767	# Useful method to print UTF8 (or other unicode) for debugging.
768	# Characters that are easily displayed (that is, printable ASCII)
769	# are shown as-is, whereas hex values of the unicode code points
770	# are shown for all other chars.
771	sub debug_unicode_string
772	{
773	join("",
774	map { $_ > 127 ? # if wide character...
775	sprintf("\\x{%04X}", $_) : # \x{...}
776	chr($_)
777	} unpack("U*", $_[0])); # unpack Unicode characters
778	}
779
780
781	sub raw_filename_to_url_encoded
782	{
783	my ($str_in) = @_;
784
785	my @url_encoded_chars
786	= map { $_ > 255 ? # Needs to be represent in entity form
787	sprintf("&#x%X;",$_) :
788	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
789	sprintf("%%%2X", $_) :
790	chr($_) # otherwise, Ascii char
791	} unpack("U*", $str_in); # Unpack Unicode characters
792
793
794	my $str_out = join("", @url_encoded_chars);
795
796	return $str_out;
797
798	}
799
800	sub url_encoded_to_raw_filename
801	{
802	my ($str_in) = @_;
803
804	my $str_out = $str_in;
805
806	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
807	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
808	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
809
810	return $str_out;
811	}
812
813
814	sub raw_filename_to_utf8_url_encoded
815	{
816	my ($str_in) = @_;
817
818	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
819
820	my @url_encoded_chars
821	= map { $_ > 127 ? # Representable in %XX form
822	sprintf("%%%2X", $_) :
823	chr($_) # otherwise, Ascii char
824	} unpack("U*", $str_in); # Unpack utf8 characters
825
826
827	my $str_out = join("", @url_encoded_chars);
828
829	return $str_out;
830
831	}
832
833	sub utf8_url_encoded_to_raw_filename
834	{
835	my ($str_in) = @_;
836
837	my $utf8_str_out = $str_in;
838
839	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
840
841	my $unicode_str_out = decode("utf8",$utf8_str_out);
842	my $raw_str_out = utf8::downgrade($unicode_str_out);
843
844	return $raw_str_out;
845	}
846
847	sub analyze_raw_string
848	{
849	my ($str_in) = @_;
850
851	my $uses_bytecodes = 0;
852	my $exceeds_bytecodes = 0;
853
854	map { $exceeds_bytecodes = 1 if ($_ >= 256);
855	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
856	} unpack("U*", $str_in); # Unpack Unicode characters
857
858	return ($uses_bytecodes,$exceeds_bytecodes);
859	}
860
861
862	sub convert_utf8_string_to_unicode_string
863	{
864	my $utf8_string = shift(@_);
865
866	my $unicode_string = "";
867	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
868	$unicode_string .= chr($unicode_value);
869	}
870	return $unicode_string;
871	}
872
873	sub convert_unicode_string_to_utf8_string
874	{
875	my $unicode_string = shift(@_);
876
877	my @unicode_array;
878	for (my $i = 0; $i < length($unicode_string); $i++) {
879	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
880	}
881	return &unicode::unicode2utf8(\@unicode_array);
882	}
883
884
885	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: