Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 23387

Last change on this file since 23387 was 23387, checked in by davidb, 13 years ago
Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 22.7 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use MIME::Base64; # for base64 encoding
39
40	no strict 'refs';
41
42
43
44	sub utf8decomp
45	{
46	my ($str) = @_;
47
48	return if (!defined $str);
49	return "" if ($str eq "");
50
51	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53	my @each_char
54	= map { ($_ > 255 )
55	? # if wide character...
56	sprintf("\\x{%04X}", $_)
57	: # \x{...}
58	(chr($_) =~ m/[[:cntrl:]]/ )
59	? # else if control character ...
60	sprintf("\\x%02X", $_)
61	: # \x..
62	quotemeta(chr($_)) # else quoted or as themselves
63	} @unpacked_chars;
64
65	return join("",@each_char);
66	}
67
68
69	sub hex_codepoint {
70	if (my $char = shift) {
71	return sprintf '%2.2x', unpack('U0U*', $char);
72	}
73	}
74
75
76
77
78	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79	# and returns a unicode array.
80	sub ascii2unicode {
81	my ($in) = @_;
82	my $out = [];
83
84	my $i = 0;
85	my $len = length($in);
86	while ($i < $len) {
87	push (@$out, ord(substr ($in, $i, 1)));
88	$i++;
89	}
90
91	return $out;
92	}
93
94	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95	# UTF-8 encoded string. This is just a faster version of
96	# "&unicode2utf8(&ascii2unicode($str));"
97	# "Extended ascii" really means "iso_8859_1"
98	sub ascii2utf8 {
99	my ($in) = @_;
100	my $out = "";
101
102	if (!defined($in)\|\| !defined($$in)) {
103	return $out;
104	}
105
106	my ($c);
107	my $i = 0;
108	my $len = length($$in);
109	while ($i < $len) {
110	$c = ord (substr ($$in, $i, 1));
111	if ($c < 0x80) {
112	# ascii character
113	$out .= chr ($c);
114
115	} else {
116	# extended ascii character
117	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
118	$out .= chr (0x80 + ($c & 0x3f));
119	}
120	$i++;
121	}
122
123	return $out;
124	}
125
126	# unicode2utf8 takes a unicode array as input and encodes it
127	# using utf-8
128	sub unicode2utf8 {
129	my ($in) = @_;
130	my $out = "";
131
132	foreach my $num (@$in) {
133	next unless defined $num;
134	if ($num < 0x80) {
135	$out .= chr ($num);
136
137	} elsif ($num < 0x800) {
138	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
139	$out .= chr (0x80 + ($num & 0x3f));
140
141	} elsif ($num < 0xFFFF) {
142	$out .= chr (0xe0 + (($num >> 12) & 0xf));
143	$out .= chr (0x80 + (($num >> 6) & 0x3f));
144	$out .= chr (0x80 + ($num & 0x3f));
145
146	} else {
147	# error, don't encode anything
148	die;
149	}
150	}
151	return $out;
152	}
153
154	# utf82unicode takes a utf-8 string and produces a unicode
155	# array
156	sub utf82unicode {
157	my ($in) = @_;
158	my $out = [];
159
160	my $i = 0;
161	my ($c1, $c2, $c3);
162	my $len = length($in);
163	while ($i < $len) {
164	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165	# normal ascii character
166	push (@$out, $c1);
167
168	} elsif ($c1 < 0xc0) {
169	# error, was expecting the first byte of an
170	# encoded character. Do nothing.
171
172	} elsif ($c1 < 0xe0 && $i+1 < $len) {
173	# an encoded character with two bytes
174	$c2 = ord (substr ($in, $i+1, 1));
175	if ($c2 >= 0x80 && $c2 < 0xc0) {
176	# everything looks ok
177	push (@$out, ((($c1 & 0x1f) << 6) +
178	($c2 & 0x3f)));
179	$i++; # gobbled an extra byte
180	}
181
182	} elsif ($c1 < 0xf0 && $i+2 < $len) {
183	# an encoded character with three bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	$c3 = ord (substr ($in, $i+2, 1));
186	if ($c2 >= 0x80 && $c2 < 0xc0 &&
187	$c3 >= 0x80 && $c3 < 0xc0) {
188	# everything looks ok
189	push (@$out, ((($c1 & 0xf) << 12) +
190	(($c2 & 0x3f) << 6) +
191	($c3 & 0x3f)));
192
193	$i += 2; # gobbled an extra two bytes
194	}
195
196	} else {
197	# error, only decode Unicode characters not full UCS.
198	# Do nothing.
199	}
200
201	$i++;
202	}
203
204	return $out;
205	}
206
207	# unicode2ucs2 takes a unicode array and produces a UCS-2
208	# unicode string (every two bytes forms a unicode character)
209	sub unicode2ucs2 {
210	my ($in) = @_;
211	my $out = "";
212
213	foreach my $num (@$in) {
214	$out .= chr (($num & 0xff00) >> 8);
215	$out .= chr ($num & 0xff);
216	}
217
218	return $out;
219	}
220
221	# ucs22unicode takes a UCS-2 string and produces a unicode array
222	sub ucs22unicode {
223	my ($in) = @_;
224	my $out = [];
225
226	my $i = 0;
227	my $len = length ($in);
228	while ($i+1 < $len) {
229	push (@$out, ord (substr($in, $i, 1)) << 8 +
230	ord (substr($in, $i+1, 1)));
231
232	$i ++;
233	}
234
235	return $out;
236	}
237
238	# takes a reference to a string and returns a reference to a unicode array
239	sub convert2unicode {
240	my ($encoding, $textref) = @_;
241
242	if (!defined $encodings::encodings->{$encoding}) {
243	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244	return [];
245	}
246
247	my $encodename = "$encoding-unicode";
248	my $enc_info = $encodings::encodings->{$encoding};
249	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250	"to_uc", $enc_info->{'mapfile'});
251	if (!&loadmapencoding ($encodename, $mapfile)) {
252	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253	return [];
254	}
255
256	if (defined $enc_info->{'converter'}) {
257	my $converter = $enc_info->{'converter'};
258	return &$converter ($encodename, $textref);
259	}
260
261	if ($unicode::translations{$encodename}->{'count'} == 1) {
262	return &singlebyte2unicode ($encodename, $textref);
263	} else {
264	return &doublebyte2unicode ($encodename, $textref);
265	}
266	}
267
268	# singlebyte2unicode converts simple 8 bit encodings where characters below
269	# 0x80 are normal ascii characters and the rest are decoded using the
270	# appropriate mapping files.
271	#
272	# Examples of encodings that may be converted using singlebyte2unicode are
273	# the iso-8859 and windows-125* series.
274	sub singlebyte2unicode {
275	my ($encodename, $textref) = @_;
276
277	my @outtext = ();
278	my $len = length($$textref);
279	my ($c);
280	my $i = 0;
281
282	while ($i < $len) {
283	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284	# normal ascii character
285	push (@outtext, $c);
286	} else {
287	$c = &transchar ($encodename, $c);
288	# put a black square if cannot translate
289	$c = 0x25A1 if $c == 0;
290	push (@outtext, $c);
291	}
292	$i ++;
293	}
294	return \@outtext;
295	}
296
297	# doublebyte2unicode converts simple two byte encodings where characters
298	# below code point 0x80 are single-byte characters and the rest are
299	# double-byte characters.
300	#
301	# Examples of encodings that may be converted using doublebyte2unicode are
302	# CJK encodings like GB encoded Chinese and UHC Korean.
303	#
304	# Note that no error checking is performed to make sure that the input text
305	# is valid for the given encoding.
306	#
307	# Also, encodings that may contain characters of more than two bytes are
308	# not supported (any EUC encoded text may in theory contain 3-byte
309	# characters but in practice only one and two byte characters are used).
310	sub doublebyte2unicode {
311	my ($encodename, $textref) = @_;
312
313	my @outtext = ();
314	my $len = length($$textref);
315	my ($c1, $c2);
316	my $i = 0;
317
318	while ($i < $len) {
319	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320	if ($i+1 < $len) {
321	# double-byte character
322	$c2 = ord(substr($$textref, $i+1, 1));
323	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
324	# put a black square if cannot translate
325	$c = 0x25A1 if $c == 0;
326	push (@outtext, $c);
327	$i += 2;
328
329	} else {
330	# error
331	print STDERR "unicode: ERROR missing second half of double-byte character\n";
332	$i++;
333	}
334
335	} else {
336	# single-byte character
337	push (@outtext, $c1);
338	$i++;
339	}
340	}
341	return \@outtext;
342	}
343
344	# Shift-JIS to unicode
345	# We can't use doublebyte2unicode for Shift-JIS because it uses some
346	# single-byte characters above code point 0x80 (i.e. half-width katakana
347	# characters in the range 0xA1-0xDF)
348	sub shiftjis2unicode {
349	my ($encodename, $textref) = @_;
350
351	my @outtext = ();
352	my $len = length($$textref);
353	my ($c1, $c2);
354	my $i = 0;
355
356	while ($i < $len) {
357	$c1 = ord(substr($$textref, $i, 1));
358
359	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
360	# Single-byte half-width katakana character or
361	# JIS Roman yen or overline characters
362	my $c = &transchar ($encodename, $c1);
363	# - put a black square if cannot translate
364	$c = 0x25A1 if $c == 0;
365	push (@outtext, $c);
366	$i++;
367
368	} elsif ($c1 < 0x80) {
369	# ASCII
370	push (@outtext, $c1);
371	$i ++;
372
373	} elsif ($c1 < 0xEF) {
374	if ($i+1 < $len) {
375	$c2 = ord(substr($$textref, $i+1, 1));
376	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
377	# Double-byte shift-jis character
378	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
379	# put a black square if cannot translate
380	$c = 0x25A1 if $c == 0;
381	push (@outtext, $c);
382	} else {
383	# error
384	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385	}
386	$i += 2;
387	} else {
388	# error
389	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390	$i ++;
391	}
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	$i ++;
396	}
397	}
398	return \@outtext;
399	}
400
401	sub transchar {
402	my ($encoding, $from) = @_;
403	my $high = ($from / 256) % 256;
404	my $low = $from % 256;
405
406	return 0 unless defined $unicode::translations{$encoding};
407
408	my $block = $unicode::translations{$encoding}->{'map'};
409
410	if (ref ($block->[$high]) ne "ARRAY") {
411	return 0;
412	}
413	return $block->[$high]->[$low];
414	}
415
416	# %translations is of the form:
417	#
418	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420	#
421	# Any of the top translation blocks can point to an undefined
422	# value. This data structure aims to allow fast translation and
423	# efficient storage.
424	%unicode::translations = ();
425
426	# @array256 is used for initialisation, there must be
427	# a better way...
428	# What about this?: @array256 = (0) x 256;
429	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446	# returns 1 if successful, 0 if unsuccessful
447	sub loadmapencoding {
448	my ($encoding, $mapfile) = @_;
449
450	# check to see if the encoding has already been loaded
451	return 1 if (defined $unicode::translations{$encoding});
452
453	if (! -r $mapfile \|\| -d $mapfile) {
454	return 0;
455	}
456	return 0 unless open (MAPFILE, $mapfile);
457	binmode (MAPFILE);
458
459	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460	my $block = $unicode::translations{$encoding};
461
462	my ($in,$i,$j);
463	while (1) {
464	my $ret=read(MAPFILE, $in, 1);
465	if (!defined($ret)) { # error
466	print STDERR "unicode.pm: error reading mapfile: $!\n";
467	last;
468	}
469	if ($ret != 1) { last }
470	$i = unpack ("C", $in);
471	$block->{'map'}->[$i] = [@unicode::array256];
472	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473	my ($n1, $n2) = unpack ("CC", $in);
474	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475	}
476	$block->{'count'} ++;
477	}
478
479	close (MAPFILE);
480	}
481
482	# unicode2singlebyte converts unicode to simple 8 bit encodings where
483	# characters below 0x80 are normal ascii characters and the rest are encoded
484	# using the appropriate mapping files.
485	#
486	# Examples of encodings that may be converted using unicode2singlebyte are
487	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488	sub unicode2singlebyte {
489	my ($uniref, $encoding) = @_;
490
491	my $outtext = "";
492	my $encodename = "unicode-$encoding";
493
494	if (!exists $encodings::encodings->{$encoding}) {
495	print STDERR "unicode.pm: ERROR - unsupported encoding "
496	. "'$encoding' requested\n";
497	return "";
498	}
499
500	my $enc_info = $encodings::encodings->{$encoding};
501	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502	"from_uc", $enc_info->{'mapfile'});
503	if (!&loadmapencoding ($encodename, $mapfile)) {
504	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505	return "";
506	}
507
508	foreach my $c (@$uniref) {
509	if ($c < 0x80) {
510	# normal ascii character
511	$outtext .= chr($c);
512	} else {
513	# extended ascii character
514	$c = &transchar ($encodename, $c);
515
516	# put a question mark if cannot translate
517	if ($c == 0) {
518	$outtext .= "?";
519	} else {
520	$outtext .= chr($c);
521	}
522	}
523	}
524	return $outtext;
525	}
526
527
528	# this makes sure that the referenced input string is utf8 encoded, and
529	# will change/remove bytes that aren't.
530	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531	sub ensure_utf8 {
532	my $stringref=shift;
533
534	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
535	return $stringref;
536	}
537
538	my $value=$$stringref;
539
540	my $non_utf8_found = 0;
541	$value =~ m/^/g; # to set \G
542	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543	my $highbytes=$1;
544	my $highbyteslength=length($highbytes);
545	# make sure this block of high bytes is utf-8
546	$highbytes =~ /^/g; # set pos()
547	my $byte_replaced = 0;
548	while ($highbytes =~
549	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
550	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
551	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
552	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
553	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
554	)*([\x80-\xff])? !xg
555	) {
556	# this highbyte is "out-of-place" for valid utf-8
557	my $badbyte=$1;
558	if (!defined $badbyte) {next} # hit end of string
559	my $pos=pos($highbytes);
560	# replace bad byte. assume iso-8859-1 -> utf-8
561	# ascii2utf8 does "extended ascii"... ie iso-8859-1
562	my $replacement=&unicode::ascii2utf8(\$badbyte);
563	substr($highbytes, $pos-1, 1, $replacement);
564	# update the position to continue searching (for \G)
565	pos($highbytes) = $pos+length($replacement)-1;
566	$byte_replaced = 1;
567	}
568	if ($byte_replaced) {
569	# replace this block of high bytes in the $value
570	$non_utf8_found = 1;
571	my $replength=length($highbytes); # we've changed the length
572	my $textpos=pos($value); # pos at end of last match
573	# replace bad bytes with good bytes
574	substr($value, $textpos-$highbyteslength,
575	$highbyteslength, $highbytes);
576	# update the position to continue searching (for \G)
577	pos($value)=$textpos+($replength-$highbyteslength)+1;
578	}
579	}
580
581	$$stringref = $value;
582	return $non_utf8_found;
583	}
584
585	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586	# Does not modify the string parameter.
587	sub check_is_utf8 {
588	my $value=shift;
589
590	if (!defined($value)) {
591	return 0; # not utf8 because it is undefined
592	}
593
594	$value =~ m/^/g; # to set \G
595	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596	my $highbytes=$1;
597	# make sure this block of high bytes is utf-8
598	$highbytes =~ /^/g; # set pos()
599	while ($highbytes =~
600	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
601	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
602	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
603	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
604	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
605	)*([\x80-\xff])? !xg
606	) {
607	my $badbyte=$1;
608	if (defined $badbyte) { # not end of string
609	return 0; # non-utf8 found
610	}
611	}
612	}
613
614	return 1;
615	}
616
617	sub url_encode {
618	my ($text) = @_;
619
620	if (!&is_url_encoded($text)) {
621	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622	# return the url-encoded character entity for underscore back to the entity
623	$text =~ s/%26%23095%3B/&\#095;/g;
624	}
625	return $text;
626	}
627
628	sub url_decode {
629	my ($text,$and_numeric_entities) = @_;
630
631	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632
633	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
634	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
635	$text =~ s/\&\#([0-9]+);/pack('C', $1)/ige;
636	}
637
638	return $text;
639	}
640
641	sub is_url_encoded {
642	my ($text) = @_;
643	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
644	}
645
646	# When a filename on the filesystem is already URL-encoded, the
647	# URL to it will have %25s in place of every % sign, so that
648	# URLs in html pages can refer to the URL-encoded filename.
649	# This method changes the URL reference back into the actual
650	# (URL-encoded) filename on the filesystem by replacing %25 with %.
651	sub url_to_filename {
652	my ($text) =@_;
653	$text =~ s/%25/%/g if &is_url_encoded($text);
654	# DM safing would have replaced underscores with character entity _
655	# in SourceFile meta. Undo any such change to get the filename referred to.
656	$text =~ s/&\#095;/_/g;
657	return $text;
658	}
659
660	# When a filename on the filesystem is already URL-encoded, the
661	# URL to it will have %25s in place of every % sign, so that
662	# URLs in html pages can refer to the URL-encoded filename.
663	# Given a (URL-encoded) filename on the filesystem, this subroutine
664	# returns the URL reference string for it by replacing % with %25.
665	# The output string will be the same as the input string if the input
666	# already contains one or more %25s. This is to prevent processing
667	# a url more than once this way.
668	sub filename_to_url {
669	my ($text) = @_;
670
671	if($text !~ m/%25/) {
672	$text =~ s/%/%25/g;
673	}
674	return $text;
675	}
676
677	sub base64_encode {
678	my ($text) = @_;
679	if(!&conforms_to_mod_base64($text)) {
680	# return entity for underscore to underscore before encoding
681	$text =~ s/&\#095;/_/g;
682
683	$text = &MIME::Base64::encode_base64($text);
684	# base64 encoding may introduce + and / signs,
685	# replacing them with - and _ to ensure it's filename-safe
686	$text =~ s/\+/\-/g; # + -> -
687	$text =~ s/\//\_/g; # / -> _
688	}
689	return $text;
690	}
691
692	# If the input fits the modified base64 pattern, this will try decoding it.
693	# Still, this method does not guarantee the return value is the 'original', only
694	# that the result is where the base64 decoding process has been applied once.
695	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
696	sub base64_decode {
697	my ($text) = @_;
698	if(&conforms_to_mod_base64($text)) {
699	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
700	# have replaced them with - and _ respectively. Undo this before decoding.
701	$text =~ s/\-/\+/g; # - -> +
702	$text =~ s/\_/\//g; # _ -> /
703	$text = &MIME::Base64::decode_base64($text);
704	}
705	return $text;
706	}
707
708	# Returns true if the given string is compatible with a modified version
709	# of base64 (where the + and / are replaced with - and _), a format which
710	# includes also regular ASCII alphanumeric values. This method does not
711	# guarantee that the given string is actually base64 encoded, since it will
712	# return true for any simple alphanumeric ASCII string as well.
713	sub conforms_to_mod_base64 {
714	my ($text) = @_;
715
716	# need to treat the entity ref for underscore as underscore
717	$text =~ s/&\#095;/_/g;
718
719	# base 64 takes alphanumeric and [=+/],
720	# but we use modified base64 where + and / are replaced with - and _
721	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
722	}
723
724	sub substr
725	{
726	my ($utf8_string, $offset, $length) = @_;
727
728	my @unicode_string = @{&utf82unicode($utf8_string)};
729	my $unicode_string_length = scalar(@unicode_string);
730
731	my $substr_start = $offset;
732	if ($substr_start >= $unicode_string_length) {
733	return "";
734	}
735
736	my $substr_end = $offset + $length - 1;
737	if ($substr_end >= $unicode_string_length) {
738	$substr_end = $unicode_string_length - 1;
739	}
740
741	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
742	return &unicode2utf8(\@unicode_substring);
743	}
744
745	# Useful method to print UTF8 (or other unicode) for debugging.
746	# Characters that are easily displayed (that is, printable ASCII)
747	# are shown as-is, whereas hex values of the unicode code points
748	# are shown for all other chars.
749	sub debug_unicode_string
750	{
751	join("",
752	map { $_ > 128 ? # if wide character...
753	sprintf("\\x{%04X}", $_) : # \x{...}
754	chr($_)
755	} unpack("U*", $_[0])); # unpack Unicode characters
756	}
757
758
759	sub raw_filename_to_url_encoded
760	{
761	my ($str_in) = @_;
762
763	my @url_encoded_chars
764	= map { $_ > 255 ? # Needs to be represent in entity form
765	sprintf("&#x%X;",$_) :
766	$_ > 128 ? # Representable in %XX form
767	sprintf("%%%2X", $_) :
768	chr($_) # otherwise, Ascii char
769	} unpack("U*", $str_in); # Unpack Unicode characters
770
771
772	my $str_out = join("", @url_encoded_chars);
773
774	return $str_out;
775
776	}
777
778	sub url_encoded_to_raw_filename
779	{
780	my ($str_in) = @_;
781
782	my $str_out = $str_in;
783
784	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
785	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
786	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
787
788	return $str_out;
789	}
790
791
792	sub raw_filename_to_utf8_url_encoded
793	{
794	my ($str_in) = @_;
795
796	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
797
798	my @url_encoded_chars
799	= map { $_ > 128 ? # Representable in %XX form
800	sprintf("%%%2X", $_) :
801	chr($_) # otherwise, Ascii char
802	} unpack("U*", $str_in); # Unpack utf8 characters
803
804
805	my $str_out = join("", @url_encoded_chars);
806
807	return $str_out;
808
809	}
810
811	sub utf8_url_encoded_to_raw_filename
812	{
813	my ($str_in) = @_;
814
815	my $utf8_str_out = $str_in;
816
817	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
818
819	my $unicode_str_out = decode("utf8",$utf8_str_out);
820	my $raw_str_out = utf8::downgrade($unicode_str_out);
821
822	return $raw_str_out;
823	}
824
825	sub analyze_raw_string
826	{
827	my ($str_in) = @_;
828
829	my $uses_bytecodes = 0;
830	my $exceeds_bytecodes = 0;
831
832	map { $exceeds_bytecodes = 1 if ($_ >= 256);
833	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
834	} unpack("U*", $str_in); # Unpack Unicode characters
835
836	return ($uses_bytecodes,$exceeds_bytecodes);
837	}
838
839
840	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: