Context Navigation

unicode.pm@ 23553

Last change on this file since 23553 was 23514, checked in by davidb, 14 years ago

Two small, but important changes:

1)Javascripts form of escaping puts in a 'u', as in #u1025

2)When URL encoding a filename we need to encode any literal '%' character into its %xx form as well.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 22.7 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use MIME::Base64; # for base64 encoding
39
40	no strict 'refs';
41
42
43
44	sub utf8decomp
45	{
46	my ($str) = @_;
47
48	return if (!defined $str);
49	return "" if ($str eq "");
50
51	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53	my @each_char
54	= map { ($_ > 255 )
55	? # if wide character...
56	sprintf("\\x{%04X}", $_)
57	: # \x{...}
58	(chr($_) =~ m/[[:cntrl:]]/ )
59	? # else if control character ...
60	sprintf("\\x%02X", $_)
61	: # \x..
62	quotemeta(chr($_)) # else quoted or as themselves
63	} @unpacked_chars;
64
65	return join("",@each_char);
66	}
67
68
69	sub hex_codepoint {
70	if (my $char = shift) {
71	return sprintf '%2.2x', unpack('U0U*', $char);
72	}
73	}
74
75
76
77
78	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79	# and returns a unicode array.
80	sub ascii2unicode {
81	my ($in) = @_;
82	my $out = [];
83
84	my $i = 0;
85	my $len = length($in);
86	while ($i < $len) {
87	push (@$out, ord(substr ($in, $i, 1)));
88	$i++;
89	}
90
91	return $out;
92	}
93
94	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95	# UTF-8 encoded string. This is just a faster version of
96	# "&unicode2utf8(&ascii2unicode($str));"
97	# "Extended ascii" really means "iso_8859_1"
98	sub ascii2utf8 {
99	my ($in) = @_;
100	my $out = "";
101
102	if (!defined($in)\|\| !defined($$in)) {
103	return $out;
104	}
105
106	my ($c);
107	my $i = 0;
108	my $len = length($$in);
109	while ($i < $len) {
110	$c = ord (substr ($$in, $i, 1));
111	if ($c < 0x80) {
112	# ascii character
113	$out .= chr ($c);
114
115	} else {
116	# extended ascii character
117	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
118	$out .= chr (0x80 + ($c & 0x3f));
119	}
120	$i++;
121	}
122
123	return $out;
124	}
125
126	# unicode2utf8 takes a unicode array as input and encodes it
127	# using utf-8
128	sub unicode2utf8 {
129	my ($in) = @_;
130	my $out = "";
131
132	foreach my $num (@$in) {
133	next unless defined $num;
134	if ($num < 0x80) {
135	$out .= chr ($num);
136
137	} elsif ($num < 0x800) {
138	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
139	$out .= chr (0x80 + ($num & 0x3f));
140
141	} elsif ($num < 0xFFFF) {
142	$out .= chr (0xe0 + (($num >> 12) & 0xf));
143	$out .= chr (0x80 + (($num >> 6) & 0x3f));
144	$out .= chr (0x80 + ($num & 0x3f));
145
146	} else {
147	# error, don't encode anything
148	die;
149	}
150	}
151	return $out;
152	}
153
154	# utf82unicode takes a utf-8 string and produces a unicode
155	# array
156	sub utf82unicode {
157	my ($in) = @_;
158	my $out = [];
159
160	my $i = 0;
161	my ($c1, $c2, $c3);
162	my $len = length($in);
163	while ($i < $len) {
164	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165	# normal ascii character
166	push (@$out, $c1);
167
168	} elsif ($c1 < 0xc0) {
169	# error, was expecting the first byte of an
170	# encoded character. Do nothing.
171
172	} elsif ($c1 < 0xe0 && $i+1 < $len) {
173	# an encoded character with two bytes
174	$c2 = ord (substr ($in, $i+1, 1));
175	if ($c2 >= 0x80 && $c2 < 0xc0) {
176	# everything looks ok
177	push (@$out, ((($c1 & 0x1f) << 6) +
178	($c2 & 0x3f)));
179	$i++; # gobbled an extra byte
180	}
181
182	} elsif ($c1 < 0xf0 && $i+2 < $len) {
183	# an encoded character with three bytes
184	$c2 = ord (substr ($in, $i+1, 1));
185	$c3 = ord (substr ($in, $i+2, 1));
186	if ($c2 >= 0x80 && $c2 < 0xc0 &&
187	$c3 >= 0x80 && $c3 < 0xc0) {
188	# everything looks ok
189	push (@$out, ((($c1 & 0xf) << 12) +
190	(($c2 & 0x3f) << 6) +
191	($c3 & 0x3f)));
192
193	$i += 2; # gobbled an extra two bytes
194	}
195
196	} else {
197	# error, only decode Unicode characters not full UCS.
198	# Do nothing.
199	}
200
201	$i++;
202	}
203
204	return $out;
205	}
206
207	# unicode2ucs2 takes a unicode array and produces a UCS-2
208	# unicode string (every two bytes forms a unicode character)
209	sub unicode2ucs2 {
210	my ($in) = @_;
211	my $out = "";
212
213	foreach my $num (@$in) {
214	$out .= chr (($num & 0xff00) >> 8);
215	$out .= chr ($num & 0xff);
216	}
217
218	return $out;
219	}
220
221	# ucs22unicode takes a UCS-2 string and produces a unicode array
222	sub ucs22unicode {
223	my ($in) = @_;
224	my $out = [];
225
226	my $i = 0;
227	my $len = length ($in);
228	while ($i+1 < $len) {
229	push (@$out, ord (substr($in, $i, 1)) << 8 +
230	ord (substr($in, $i+1, 1)));
231
232	$i ++;
233	}
234
235	return $out;
236	}
237
238	# takes a reference to a string and returns a reference to a unicode array
239	sub convert2unicode {
240	my ($encoding, $textref) = @_;
241
242	if (!defined $encodings::encodings->{$encoding}) {
243	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244	return [];
245	}
246
247	my $encodename = "$encoding-unicode";
248	my $enc_info = $encodings::encodings->{$encoding};
249	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250	"to_uc", $enc_info->{'mapfile'});
251	if (!&loadmapencoding ($encodename, $mapfile)) {
252	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253	return [];
254	}
255
256	if (defined $enc_info->{'converter'}) {
257	my $converter = $enc_info->{'converter'};
258	return &$converter ($encodename, $textref);
259	}
260
261	if ($unicode::translations{$encodename}->{'count'} == 1) {
262	return &singlebyte2unicode ($encodename, $textref);
263	} else {
264	return &doublebyte2unicode ($encodename, $textref);
265	}
266	}
267
268	# singlebyte2unicode converts simple 8 bit encodings where characters below
269	# 0x80 are normal ascii characters and the rest are decoded using the
270	# appropriate mapping files.
271	#
272	# Examples of encodings that may be converted using singlebyte2unicode are
273	# the iso-8859 and windows-125* series.
274	sub singlebyte2unicode {
275	my ($encodename, $textref) = @_;
276
277	my @outtext = ();
278	my $len = length($$textref);
279	my ($c);
280	my $i = 0;
281
282	while ($i < $len) {
283	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284	# normal ascii character
285	push (@outtext, $c);
286	} else {
287	$c = &transchar ($encodename, $c);
288	# put a black square if cannot translate
289	$c = 0x25A1 if $c == 0;
290	push (@outtext, $c);
291	}
292	$i ++;
293	}
294	return \@outtext;
295	}
296
297	# doublebyte2unicode converts simple two byte encodings where characters
298	# below code point 0x80 are single-byte characters and the rest are
299	# double-byte characters.
300	#
301	# Examples of encodings that may be converted using doublebyte2unicode are
302	# CJK encodings like GB encoded Chinese and UHC Korean.
303	#
304	# Note that no error checking is performed to make sure that the input text
305	# is valid for the given encoding.
306	#
307	# Also, encodings that may contain characters of more than two bytes are
308	# not supported (any EUC encoded text may in theory contain 3-byte
309	# characters but in practice only one and two byte characters are used).
310	sub doublebyte2unicode {
311	my ($encodename, $textref) = @_;
312
313	my @outtext = ();
314	my $len = length($$textref);
315	my ($c1, $c2);
316	my $i = 0;
317
318	while ($i < $len) {
319	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320	if ($i+1 < $len) {
321	# double-byte character
322	$c2 = ord(substr($$textref, $i+1, 1));
323	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
324	# put a black square if cannot translate
325	$c = 0x25A1 if $c == 0;
326	push (@outtext, $c);
327	$i += 2;
328
329	} else {
330	# error
331	print STDERR "unicode: ERROR missing second half of double-byte character\n";
332	$i++;
333	}
334
335	} else {
336	# single-byte character
337	push (@outtext, $c1);
338	$i++;
339	}
340	}
341	return \@outtext;
342	}
343
344	# Shift-JIS to unicode
345	# We can't use doublebyte2unicode for Shift-JIS because it uses some
346	# single-byte characters above code point 0x80 (i.e. half-width katakana
347	# characters in the range 0xA1-0xDF)
348	sub shiftjis2unicode {
349	my ($encodename, $textref) = @_;
350
351	my @outtext = ();
352	my $len = length($$textref);
353	my ($c1, $c2);
354	my $i = 0;
355
356	while ($i < $len) {
357	$c1 = ord(substr($$textref, $i, 1));
358
359	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
360	# Single-byte half-width katakana character or
361	# JIS Roman yen or overline characters
362	my $c = &transchar ($encodename, $c1);
363	# - put a black square if cannot translate
364	$c = 0x25A1 if $c == 0;
365	push (@outtext, $c);
366	$i++;
367
368	} elsif ($c1 < 0x80) {
369	# ASCII
370	push (@outtext, $c1);
371	$i ++;
372
373	} elsif ($c1 < 0xEF) {
374	if ($i+1 < $len) {
375	$c2 = ord(substr($$textref, $i+1, 1));
376	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
377	# Double-byte shift-jis character
378	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
379	# put a black square if cannot translate
380	$c = 0x25A1 if $c == 0;
381	push (@outtext, $c);
382	} else {
383	# error
384	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385	}
386	$i += 2;
387	} else {
388	# error
389	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390	$i ++;
391	}
392	} else {
393	# error
394	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395	$i ++;
396	}
397	}
398	return \@outtext;
399	}
400
401	sub transchar {
402	my ($encoding, $from) = @_;
403	my $high = ($from / 256) % 256;
404	my $low = $from % 256;
405
406	return 0 unless defined $unicode::translations{$encoding};
407
408	my $block = $unicode::translations{$encoding}->{'map'};
409
410	if (ref ($block->[$high]) ne "ARRAY") {
411	return 0;
412	}
413	return $block->[$high]->[$low];
414	}
415
416	# %translations is of the form:
417	#
418	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420	#
421	# Any of the top translation blocks can point to an undefined
422	# value. This data structure aims to allow fast translation and
423	# efficient storage.
424	%unicode::translations = ();
425
426	# @array256 is used for initialisation, there must be
427	# a better way...
428	# What about this?: @array256 = (0) x 256;
429	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446	# returns 1 if successful, 0 if unsuccessful
447	sub loadmapencoding {
448	my ($encoding, $mapfile) = @_;
449
450	# check to see if the encoding has already been loaded
451	return 1 if (defined $unicode::translations{$encoding});
452
453	if (! -r $mapfile \|\| -d $mapfile) {
454	return 0;
455	}
456	return 0 unless open (MAPFILE, $mapfile);
457	binmode (MAPFILE);
458
459	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460	my $block = $unicode::translations{$encoding};
461
462	my ($in,$i,$j);
463	while (1) {
464	my $ret=read(MAPFILE, $in, 1);
465	if (!defined($ret)) { # error
466	print STDERR "unicode.pm: error reading mapfile: $!\n";
467	last;
468	}
469	if ($ret != 1) { last }
470	$i = unpack ("C", $in);
471	$block->{'map'}->[$i] = [@unicode::array256];
472	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473	my ($n1, $n2) = unpack ("CC", $in);
474	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475	}
476	$block->{'count'} ++;
477	}
478
479	close (MAPFILE);
480	}
481
482	# unicode2singlebyte converts unicode to simple 8 bit encodings where
483	# characters below 0x80 are normal ascii characters and the rest are encoded
484	# using the appropriate mapping files.
485	#
486	# Examples of encodings that may be converted using unicode2singlebyte are
487	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488	sub unicode2singlebyte {
489	my ($uniref, $encoding) = @_;
490
491	my $outtext = "";
492	my $encodename = "unicode-$encoding";
493
494	if (!exists $encodings::encodings->{$encoding}) {
495	print STDERR "unicode.pm: ERROR - unsupported encoding "
496	. "'$encoding' requested\n";
497	return "";
498	}
499
500	my $enc_info = $encodings::encodings->{$encoding};
501	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502	"from_uc", $enc_info->{'mapfile'});
503	if (!&loadmapencoding ($encodename, $mapfile)) {
504	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505	return "";
506	}
507
508	foreach my $c (@$uniref) {
509	if ($c < 0x80) {
510	# normal ascii character
511	$outtext .= chr($c);
512	} else {
513	# extended ascii character
514	$c = &transchar ($encodename, $c);
515
516	# put a question mark if cannot translate
517	if ($c == 0) {
518	$outtext .= "?";
519	} else {
520	$outtext .= chr($c);
521	}
522	}
523	}
524	return $outtext;
525	}
526
527
528	# this makes sure that the referenced input string is utf8 encoded, and
529	# will change/remove bytes that aren't.
530	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531	sub ensure_utf8 {
532	my $stringref=shift;
533
534	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
535	return $stringref;
536	}
537
538	my $value=$$stringref;
539
540	my $non_utf8_found = 0;
541	$value =~ m/^/g; # to set \G
542	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543	my $highbytes=$1;
544	my $highbyteslength=length($highbytes);
545	# make sure this block of high bytes is utf-8
546	$highbytes =~ /^/g; # set pos()
547	my $byte_replaced = 0;
548	while ($highbytes =~
549	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
550	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
551	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
552	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
553	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
554	)*([\x80-\xff])? !xg
555	) {
556	# this highbyte is "out-of-place" for valid utf-8
557	my $badbyte=$1;
558	if (!defined $badbyte) {next} # hit end of string
559	my $pos=pos($highbytes);
560	# replace bad byte. assume iso-8859-1 -> utf-8
561	# ascii2utf8 does "extended ascii"... ie iso-8859-1
562	my $replacement=&unicode::ascii2utf8(\$badbyte);
563	substr($highbytes, $pos-1, 1, $replacement);
564	# update the position to continue searching (for \G)
565	pos($highbytes) = $pos+length($replacement)-1;
566	$byte_replaced = 1;
567	}
568	if ($byte_replaced) {
569	# replace this block of high bytes in the $value
570	$non_utf8_found = 1;
571	my $replength=length($highbytes); # we've changed the length
572	my $textpos=pos($value); # pos at end of last match
573	# replace bad bytes with good bytes
574	substr($value, $textpos-$highbyteslength,
575	$highbyteslength, $highbytes);
576	# update the position to continue searching (for \G)
577	pos($value)=$textpos+($replength-$highbyteslength)+1;
578	}
579	}
580
581	$$stringref = $value;
582	return $non_utf8_found;
583	}
584
585	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586	# Does not modify the string parameter.
587	sub check_is_utf8 {
588	my $value=shift;
589
590	if (!defined($value)) {
591	return 0; # not utf8 because it is undefined
592	}
593
594	$value =~ m/^/g; # to set \G
595	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596	my $highbytes=$1;
597	# make sure this block of high bytes is utf-8
598	$highbytes =~ /^/g; # set pos()
599	while ($highbytes =~
600	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
601	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
602	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
603	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
604	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
605	)*([\x80-\xff])? !xg
606	) {
607	my $badbyte=$1;
608	if (defined $badbyte) { # not end of string
609	return 0; # non-utf8 found
610	}
611	}
612	}
613
614	return 1;
615	}
616
617	sub url_encode {
618	my ($text) = @_;
619
620	if (!&is_url_encoded($text)) {
621	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622	# return the url-encoded character entity for underscore back to the entity
623	$text =~ s/%26%23095%3B/&\#095;/g;
624	}
625	return $text;
626	}
627
628	sub url_decode {
629	my ($text,$and_numeric_entities) = @_;
630
631	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632
633	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
634	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
635	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
636	}
637
638	return $text;
639	}
640
641	sub is_url_encoded {
642	my ($text) = @_;
643	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
644	}
645
646	# When a filename on the filesystem is already URL-encoded, the
647	# URL to it will have %25s in place of every % sign, so that
648	# URLs in html pages can refer to the URL-encoded filename.
649	# This method changes the URL reference back into the actual
650	# (URL-encoded) filename on the filesystem by replacing %25 with %.
651	sub url_to_filename {
652	my ($text) =@_;
653	$text =~ s/%25/%/g if &is_url_encoded($text);
654	# DM safing would have replaced underscores with character entity _
655	# in SourceFile meta. Undo any such change to get the filename referred to.
656	$text =~ s/&\#095;/_/g;
657	return $text;
658	}
659
660	# When a filename on the filesystem is already URL-encoded, the
661	# URL to it will have %25s in place of every % sign, so that
662	# URLs in html pages can refer to the URL-encoded filename.
663	# Given a (URL-encoded) filename on the filesystem, this subroutine
664	# returns the URL reference string for it by replacing % with %25.
665	# The output string will be the same as the input string if the input
666	# already contains one or more %25s. This is to prevent processing
667	# a url more than once this way.
668	sub filename_to_url {
669	my ($text) = @_;
670
671	if($text !~ m/%25/) {
672	$text =~ s/%/%25/g;
673	}
674	return $text;
675	}
676
677	sub base64_encode {
678	my ($text) = @_;
679	if(!&conforms_to_mod_base64($text)) {
680	# return entity for underscore to underscore before encoding
681	$text =~ s/&\#095;/_/g;
682
683	$text = &MIME::Base64::encode_base64($text);
684	# base64 encoding may introduce + and / signs,
685	# replacing them with - and _ to ensure it's filename-safe
686	$text =~ s/\+/\-/g; # + -> -
687	$text =~ s/\//\_/g; # / -> _
688	}
689	return $text;
690	}
691
692	# If the input fits the modified base64 pattern, this will try decoding it.
693	# Still, this method does not guarantee the return value is the 'original', only
694	# that the result is where the base64 decoding process has been applied once.
695	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
696	sub base64_decode {
697	my ($text) = @_;
698	if(&conforms_to_mod_base64($text)) {
699	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
700	# have replaced them with - and _ respectively. Undo this before decoding.
701	$text =~ s/\-/\+/g; # - -> +
702	$text =~ s/\_/\//g; # _ -> /
703	$text = &MIME::Base64::decode_base64($text);
704	}
705	return $text;
706	}
707
708	# Returns true if the given string is compatible with a modified version
709	# of base64 (where the + and / are replaced with - and _), a format which
710	# includes also regular ASCII alphanumeric values. This method does not
711	# guarantee that the given string is actually base64 encoded, since it will
712	# return true for any simple alphanumeric ASCII string as well.
713	sub conforms_to_mod_base64 {
714	my ($text) = @_;
715
716	# need to treat the entity ref for underscore as underscore
717	$text =~ s/&\#095;/_/g;
718
719	# base 64 takes alphanumeric and [=+/],
720	# but we use modified base64 where + and / are replaced with - and _
721	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
722	}
723
724	sub substr
725	{
726	my ($utf8_string, $offset, $length) = @_;
727
728	my @unicode_string = @{&utf82unicode($utf8_string)};
729	my $unicode_string_length = scalar(@unicode_string);
730
731	my $substr_start = $offset;
732	if ($substr_start >= $unicode_string_length) {
733	return "";
734	}
735
736	my $substr_end = $offset + $length - 1;
737	if ($substr_end >= $unicode_string_length) {
738	$substr_end = $unicode_string_length - 1;
739	}
740
741	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
742	return &unicode2utf8(\@unicode_substring);
743	}
744
745	# Useful method to print UTF8 (or other unicode) for debugging.
746	# Characters that are easily displayed (that is, printable ASCII)
747	# are shown as-is, whereas hex values of the unicode code points
748	# are shown for all other chars.
749	sub debug_unicode_string
750	{
751	join("",
752	map { $_ > 128 ? # if wide character...
753	sprintf("\\x{%04X}", $_) : # \x{...}
754	chr($_)
755	} unpack("U*", $_[0])); # unpack Unicode characters
756	}
757
758
759	sub raw_filename_to_url_encoded
760	{
761	my ($str_in) = @_;
762
763	my @url_encoded_chars
764	= map { $_ > 255 ? # Needs to be represent in entity form
765	sprintf("&#x%X;",$_) :
766	$_>128 \|\| $_==ord("%") ? # Representable in %XX form
767	sprintf("%%%2X", $_) :
768	chr($_) # otherwise, Ascii char
769	} unpack("U*", $str_in); # Unpack Unicode characters
770
771
772	my $str_out = join("", @url_encoded_chars);
773
774	return $str_out;
775
776	}
777
778	sub url_encoded_to_raw_filename
779	{
780	my ($str_in) = @_;
781
782	my $str_out = $str_in;
783
784	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
785	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
786	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
787
788	return $str_out;
789	}
790
791
792	sub raw_filename_to_utf8_url_encoded
793	{
794	my ($str_in) = @_;
795
796	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
797
798	my @url_encoded_chars
799	= map { $_ > 128 ? # Representable in %XX form
800	sprintf("%%%2X", $_) :
801	chr($_) # otherwise, Ascii char
802	} unpack("U*", $str_in); # Unpack utf8 characters
803
804
805	my $str_out = join("", @url_encoded_chars);
806
807	return $str_out;
808
809	}
810
811	sub utf8_url_encoded_to_raw_filename
812	{
813	my ($str_in) = @_;
814
815	my $utf8_str_out = $str_in;
816
817	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
818
819	my $unicode_str_out = decode("utf8",$utf8_str_out);
820	my $raw_str_out = utf8::downgrade($unicode_str_out);
821
822	return $raw_str_out;
823	}
824
825	sub analyze_raw_string
826	{
827	my ($str_in) = @_;
828
829	my $uses_bytecodes = 0;
830	my $exceeds_bytecodes = 0;
831
832	map { $exceeds_bytecodes = 1 if ($_ >= 256);
833	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
834	} unpack("U*", $str_in); # Unpack Unicode characters
835
836	return ($uses_bytecodes,$exceeds_bytecodes);
837	}
838
839
840	1;

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 23553

Download in other formats: