Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 27353

Last change on this file since 27353 was 27333, checked in by ak19, 11 years ago
Added the metadataaction::get_import_metadata_array() subroutine for completion and tested it. 2. A couple of unicode subroutines print warnings when parameters are undefined. Since they can be undefined sometimes, added a test for this so as to avoid the unnecessary warnings.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 23.5 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# unicode.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[7798]	8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[71]	26	# useful functions for dealing with Unicode
	27
	28	# Unicode strings are stored as arrays of scalars as perl
	29	# lacks characters are 8-bit (currently)
	30
	31	package unicode;
[3767]	32
[3834]	33	eval {require bytes};
[3767]	34
[1870]	35	use encodings;
[15894]	36	use strict;
[2713]	37	use util;
[27306]	38	use FileUtils;
[18342]	39	use MIME::Base64; # for base64 encoding
[71]	40
[18245]	41	no strict 'refs';
	42
[23304]	43
	44
	45	sub utf8decomp
	46	{
	47	my ($str) = @_;
	48
	49	return if (!defined $str);
	50	return "" if ($str eq "");
	51
	52	my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
	53
	54	my @each_char
	55	= map { ($_ > 255 )
	56	? # if wide character...
	57	sprintf("\\x{%04X}", $_)
	58	: # \x{...}
	59	(chr($_) =~ m/[[:cntrl:]]/ )
	60	? # else if control character ...
	61	sprintf("\\x%02X", $_)
	62	: # \x..
	63	quotemeta(chr($_)) # else quoted or as themselves
	64	} @unpacked_chars;
	65
	66	return join("",@each_char);
	67	}
	68
	69
	70	sub hex_codepoint {
	71	if (my $char = shift) {
	72	return sprintf '%2.2x', unpack('U0U*', $char);
	73	}
	74	}
	75
	76
	77
	78
[1223]	79	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
	80	# and returns a unicode array.
[73]	81	sub ascii2unicode {
	82	my ($in) = @_;
	83	my $out = [];
	84
	85	my $i = 0;
	86	my $len = length($in);
	87	while ($i < $len) {
	88	push (@$out, ord(substr ($in, $i, 1)));
	89	$i++;
	90	}
	91
	92	return $out;
	93	}
	94
[1870]	95	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
	96	# UTF-8 encoded string. This is just a faster version of
	97	# "&unicode2utf8(&ascii2unicode($str));"
[7815]	98	# "Extended ascii" really means "iso_8859_1"
[73]	99	sub ascii2utf8 {
	100	my ($in) = @_;
	101	my $out = "";
	102
[2483]	103	if (!defined($in)\|\| !defined($$in)) {
	104	return $out;
	105	}
	106
[73]	107	my ($c);
	108	my $i = 0;
[1870]	109	my $len = length($$in);
[73]	110	while ($i < $len) {
[1870]	111	$c = ord (substr ($$in, $i, 1));
[73]	112	if ($c < 0x80) {
	113	# ascii character
	114	$out .= chr ($c);
	115
	116	} else {
	117	# extended ascii character
	118	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
	119	$out .= chr (0x80 + ($c & 0x3f));
	120	}
	121	$i++;
	122	}
	123
	124	return $out;
	125	}
	126
[71]	127	# unicode2utf8 takes a unicode array as input and encodes it
	128	# using utf-8
	129	sub unicode2utf8 {
	130	my ($in) = @_;
	131	my $out = "";
	132
[8716]	133	foreach my $num (@$in) {
[1868]	134	next unless defined $num;
[71]	135	if ($num < 0x80) {
	136	$out .= chr ($num);
	137
	138	} elsif ($num < 0x800) {
	139	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
	140	$out .= chr (0x80 + ($num & 0x3f));
	141
	142	} elsif ($num < 0xFFFF) {
	143	$out .= chr (0xe0 + (($num >> 12) & 0xf));
	144	$out .= chr (0x80 + (($num >> 6) & 0x3f));
	145	$out .= chr (0x80 + ($num & 0x3f));
	146
	147	} else {
	148	# error, don't encode anything
	149	die;
	150	}
	151	}
	152	return $out;
	153	}
	154
	155	# utf82unicode takes a utf-8 string and produces a unicode
	156	# array
	157	sub utf82unicode {
	158	my ($in) = @_;
	159	my $out = [];
	160
[27333]	161	if(!defined $in) {
	162	return $out;
	163	}
	164
[71]	165	my $i = 0;
	166	my ($c1, $c2, $c3);
[8716]	167	my $len = length($in);
[71]	168	while ($i < $len) {
	169	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
	170	# normal ascii character
	171	push (@$out, $c1);
	172
	173	} elsif ($c1 < 0xc0) {
	174	# error, was expecting the first byte of an
	175	# encoded character. Do nothing.
	176
	177	} elsif ($c1 < 0xe0 && $i+1 < $len) {
	178	# an encoded character with two bytes
	179	$c2 = ord (substr ($in, $i+1, 1));
	180	if ($c2 >= 0x80 && $c2 < 0xc0) {
	181	# everything looks ok
	182	push (@$out, ((($c1 & 0x1f) << 6) +
	183	($c2 & 0x3f)));
	184	$i++; # gobbled an extra byte
	185	}
	186
	187	} elsif ($c1 < 0xf0 && $i+2 < $len) {
	188	# an encoded character with three bytes
	189	$c2 = ord (substr ($in, $i+1, 1));
	190	$c3 = ord (substr ($in, $i+2, 1));
	191	if ($c2 >= 0x80 && $c2 < 0xc0 &&
	192	$c3 >= 0x80 && $c3 < 0xc0) {
	193	# everything looks ok
	194	push (@$out, ((($c1 & 0xf) << 12) +
	195	(($c2 & 0x3f) << 6) +
	196	($c3 & 0x3f)));
	197
	198	$i += 2; # gobbled an extra two bytes
	199	}
	200
	201	} else {
	202	# error, only decode Unicode characters not full UCS.
	203	# Do nothing.
	204	}
	205
	206	$i++;
	207	}
	208
	209	return $out;
	210	}
	211
	212	# unicode2ucs2 takes a unicode array and produces a UCS-2
	213	# unicode string (every two bytes forms a unicode character)
	214	sub unicode2ucs2 {
	215	my ($in) = @_;
	216	my $out = "";
	217
[8716]	218	foreach my $num (@$in) {
[71]	219	$out .= chr (($num & 0xff00) >> 8);
	220	$out .= chr ($num & 0xff);
	221	}
	222
	223	return $out;
	224	}
	225
	226	# ucs22unicode takes a UCS-2 string and produces a unicode array
	227	sub ucs22unicode {
	228	my ($in) = @_;
	229	my $out = [];
	230
	231	my $i = 0;
	232	my $len = length ($in);
	233	while ($i+1 < $len) {
	234	push (@$out, ord (substr($in, $i, 1)) << 8 +
	235	ord (substr($in, $i+1, 1)));
	236
	237	$i ++;
	238	}
	239
	240	return $out;
	241	}
	242
[1870]	243	# takes a reference to a string and returns a reference to a unicode array
	244	sub convert2unicode {
	245	my ($encoding, $textref) = @_;
[71]	246
[1870]	247	if (!defined $encodings::encodings->{$encoding}) {
	248	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
	249	return [];
[1227]	250	}
	251
[1870]	252	my $encodename = "$encoding-unicode";
	253	my $enc_info = $encodings::encodings->{$encoding};
[27306]	254	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
[1870]	255	"to_uc", $enc_info->{'mapfile'});
	256	if (!&loadmapencoding ($encodename, $mapfile)) {
[10983]	257	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
[1870]	258	return [];
[1227]	259	}
[1870]	260
	261	if (defined $enc_info->{'converter'}) {
	262	my $converter = $enc_info->{'converter'};
	263	return &$converter ($encodename, $textref);
	264	}
[1227]	265
[8716]	266	if ($unicode::translations{$encodename}->{'count'} == 1) {
[1870]	267	return &singlebyte2unicode ($encodename, $textref);
	268	} else {
	269	return &doublebyte2unicode ($encodename, $textref);
	270	}
	271	}
[1227]	272
[1870]	273	# singlebyte2unicode converts simple 8 bit encodings where characters below
	274	# 0x80 are normal ascii characters and the rest are decoded using the
	275	# appropriate mapping files.
	276	#
	277	# Examples of encodings that may be converted using singlebyte2unicode are
[4229]	278	# the iso-8859 and windows-125* series.
[1870]	279	sub singlebyte2unicode {
	280	my ($encodename, $textref) = @_;
[1227]	281
[1870]	282	my @outtext = ();
	283	my $len = length($$textref);
	284	my ($c);
	285	my $i = 0;
[1227]	286
[1870]	287	while ($i < $len) {
	288	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
	289	# normal ascii character
	290	push (@outtext, $c);
	291	} else {
	292	$c = &transchar ($encodename, $c);
	293	# put a black square if cannot translate
	294	$c = 0x25A1 if $c == 0;
	295	push (@outtext, $c);
	296	}
	297	$i ++;
[1227]	298	}
[1870]	299	return \@outtext;
	300	}
[1227]	301
[1870]	302	# doublebyte2unicode converts simple two byte encodings where characters
	303	# below code point 0x80 are single-byte characters and the rest are
	304	# double-byte characters.
	305	#
	306	# Examples of encodings that may be converted using doublebyte2unicode are
	307	# CJK encodings like GB encoded Chinese and UHC Korean.
	308	#
	309	# Note that no error checking is performed to make sure that the input text
	310	# is valid for the given encoding.
	311	#
	312	# Also, encodings that may contain characters of more than two bytes are
	313	# not supported (any EUC encoded text may in theory contain 3-byte
	314	# characters but in practice only one and two byte characters are used).
	315	sub doublebyte2unicode {
	316	my ($encodename, $textref) = @_;
	317
	318	my @outtext = ();
	319	my $len = length($$textref);
	320	my ($c1, $c2);
	321	my $i = 0;
[1227]	322
[1870]	323	while ($i < $len) {
	324	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
	325	if ($i+1 < $len) {
	326	# double-byte character
	327	$c2 = ord(substr($$textref, $i+1, 1));
	328	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	329	# put a black square if cannot translate
	330	$c = 0x25A1 if $c == 0;
	331	push (@outtext, $c);
	332	$i += 2;
	333
	334	} else {
	335	# error
	336	print STDERR "unicode: ERROR missing second half of double-byte character\n";
	337	$i++;
	338	}
	339
	340	} else {
	341	# single-byte character
	342	push (@outtext, $c1);
	343	$i++;
	344	}
	345	}
	346	return \@outtext;
[1227]	347	}
	348
[1870]	349	# Shift-JIS to unicode
	350	# We can't use doublebyte2unicode for Shift-JIS because it uses some
	351	# single-byte characters above code point 0x80 (i.e. half-width katakana
	352	# characters in the range 0xA1-0xDF)
	353	sub shiftjis2unicode {
	354	my ($encodename, $textref) = @_;
	355
	356	my @outtext = ();
	357	my $len = length($$textref);
	358	my ($c1, $c2);
	359	my $i = 0;
[1227]	360
[1870]	361	while ($i < $len) {
	362	$c1 = ord(substr($$textref, $i, 1));
[1868]	363
[1870]	364	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
	365	# Single-byte half-width katakana character or
	366	# JIS Roman yen or overline characters
	367	my $c = &transchar ($encodename, $c1);
	368	# - put a black square if cannot translate
	369	$c = 0x25A1 if $c == 0;
	370	push (@outtext, $c);
	371	$i++;
[1868]	372
[1870]	373	} elsif ($c1 < 0x80) {
	374	# ASCII
	375	push (@outtext, $c1);
	376	$i ++;
[1868]	377
[1870]	378	} elsif ($c1 < 0xEF) {
	379	if ($i+1 < $len) {
	380	$c2 = ord(substr($$textref, $i+1, 1));
	381	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
	382	# Double-byte shift-jis character
	383	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	384	# put a black square if cannot translate
	385	$c = 0x25A1 if $c == 0;
	386	push (@outtext, $c);
	387	} else {
	388	# error
	389	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
	390	}
	391	$i += 2;
	392	} else {
	393	# error
	394	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
	395	$i ++;
	396	}
	397	} else {
	398	# error
	399	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
	400	$i ++;
	401	}
	402	}
	403	return \@outtext;
	404	}
[1868]	405
[1870]	406	sub transchar {
	407	my ($encoding, $from) = @_;
	408	my $high = ($from / 256) % 256;
	409	my $low = $from % 256;
[1868]	410
[8716]	411	return 0 unless defined $unicode::translations{$encoding};
[1868]	412
[8716]	413	my $block = $unicode::translations{$encoding}->{'map'};
[1868]	414
[1870]	415	if (ref ($block->[$high]) ne "ARRAY") {
	416	return 0;
	417	}
	418	return $block->[$high]->[$low];
	419	}
[1868]	420
	421	# %translations is of the form:
	422	#
[1870]	423	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
[1868]	424	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
	425	#
	426	# Any of the top translation blocks can point to an undefined
	427	# value. This data structure aims to allow fast translation and
	428	# efficient storage.
[8716]	429	%unicode::translations = ();
[1868]	430
	431	# @array256 is used for initialisation, there must be
	432	# a better way...
[4229]	433	# What about this?: @array256 = (0) x 256;
[8716]	434	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
[1868]	435	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	436	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	437	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	438	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	439	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	440	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	441	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	442	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	443	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	444	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	445	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	446	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	447	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	448	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	449	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
	450
	451	# returns 1 if successful, 0 if unsuccessful
	452	sub loadmapencoding {
	453	my ($encoding, $mapfile) = @_;
	454
	455	# check to see if the encoding has already been loaded
[8716]	456	return 1 if (defined $unicode::translations{$encoding});
[1868]	457
[9967]	458	if (! -r $mapfile \|\| -d $mapfile) {
	459	return 0;
	460	}
[1868]	461	return 0 unless open (MAPFILE, $mapfile);
	462	binmode (MAPFILE);
	463
[8716]	464	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
	465	my $block = $unicode::translations{$encoding};
[1868]	466
	467	my ($in,$i,$j);
[9967]	468	while (1) {
	469	my $ret=read(MAPFILE, $in, 1);
	470	if (!defined($ret)) { # error
	471	print STDERR "unicode.pm: error reading mapfile: $!\n";
	472	last;
	473	}
	474	if ($ret != 1) { last }
[1868]	475	$i = unpack ("C", $in);
[8716]	476	$block->{'map'}->[$i] = [@unicode::array256];
[1868]	477	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
	478	my ($n1, $n2) = unpack ("CC", $in);
[1870]	479	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
[1868]	480	}
[1870]	481	$block->{'count'} ++;
[1868]	482	}
	483
	484	close (MAPFILE);
	485	}
	486
[4229]	487	# unicode2singlebyte converts unicode to simple 8 bit encodings where
	488	# characters below 0x80 are normal ascii characters and the rest are encoded
	489	# using the appropriate mapping files.
	490	#
	491	# Examples of encodings that may be converted using unicode2singlebyte are
	492	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
	493	sub unicode2singlebyte {
	494	my ($uniref, $encoding) = @_;
	495
[2237]	496	my $outtext = "";
[4229]	497	my $encodename = "unicode-$encoding";
[9967]	498
	499	if (!exists $encodings::encodings->{$encoding}) {
	500	print STDERR "unicode.pm: ERROR - unsupported encoding "
	501	. "'$encoding' requested\n";
	502	return "";
	503	}
	504
	505	my $enc_info = $encodings::encodings->{$encoding};
[27306]	506	my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
[2237]	507	"from_uc", $enc_info->{'mapfile'});
	508	if (!&loadmapencoding ($encodename, $mapfile)) {
[10983]	509	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
[2237]	510	return "";
	511	}
	512
	513	foreach my $c (@$uniref) {
	514	if ($c < 0x80) {
	515	# normal ascii character
	516	$outtext .= chr($c);
	517	} else {
	518	# extended ascii character
	519	$c = &transchar ($encodename, $c);
	520
	521	# put a question mark if cannot translate
	522	if ($c == 0) {
	523	$outtext .= "?";
	524	} else {
	525	$outtext .= chr($c);
	526	}
	527	}
	528	}
	529	return $outtext;
	530	}
	531
[7798]	532
	533	# this makes sure that the referenced input string is utf8 encoded, and
	534	# will change/remove bytes that aren't.
	535	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
	536	sub ensure_utf8 {
	537	my $stringref=shift;
	538
[8217]	539	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
	540	return $stringref;
	541	}
	542
[7798]	543	my $value=$$stringref;
	544
	545	my $non_utf8_found = 0;
	546	$value =~ m/^/g; # to set \G
	547	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
	548	my $highbytes=$1;
	549	my $highbyteslength=length($highbytes);
	550	# make sure this block of high bytes is utf-8
	551	$highbytes =~ /^/g; # set pos()
	552	my $byte_replaced = 0;
	553	while ($highbytes =~
[8903]	554	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7798]	555	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8903]	556	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	557	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	558	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
[7798]	559	)*([\x80-\xff])? !xg
	560	) {
	561	# this highbyte is "out-of-place" for valid utf-8
	562	my $badbyte=$1;
	563	if (!defined $badbyte) {next} # hit end of string
	564	my $pos=pos($highbytes);
	565	# replace bad byte. assume iso-8859-1 -> utf-8
	566	# ascii2utf8 does "extended ascii"... ie iso-8859-1
	567	my $replacement=&unicode::ascii2utf8(\$badbyte);
	568	substr($highbytes, $pos-1, 1, $replacement);
	569	# update the position to continue searching (for \G)
	570	pos($highbytes) = $pos+length($replacement)-1;
	571	$byte_replaced = 1;
	572	}
	573	if ($byte_replaced) {
	574	# replace this block of high bytes in the $value
	575	$non_utf8_found = 1;
	576	my $replength=length($highbytes); # we've changed the length
	577	my $textpos=pos($value); # pos at end of last match
	578	# replace bad bytes with good bytes
	579	substr($value, $textpos-$highbyteslength,
	580	$highbyteslength, $highbytes);
	581	# update the position to continue searching (for \G)
	582	pos($value)=$textpos+($replength-$highbyteslength)+1;
	583	}
	584	}
	585
[7817]	586	$$stringref = $value;
[7798]	587	return $non_utf8_found;
	588	}
	589
[16553]	590	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
	591	# Does not modify the string parameter.
	592	sub check_is_utf8 {
	593	my $value=shift;
	594
	595	if (!defined($value)) {
	596	return 0; # not utf8 because it is undefined
	597	}
	598
	599	$value =~ m/^/g; # to set \G
	600	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
	601	my $highbytes=$1;
	602	# make sure this block of high bytes is utf-8
	603	$highbytes =~ /^/g; # set pos()
	604	while ($highbytes =~
	605	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	606	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	607	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	608	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	609	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
	610	)*([\x80-\xff])? !xg
	611	) {
	612	my $badbyte=$1;
	613	if (defined $badbyte) { # not end of string
	614	return 0; # non-utf8 found
	615	}
	616	}
	617	}
	618
	619	return 1;
	620	}
	621
[16672]	622	sub url_encode {
	623	my ($text) = @_;
[18338]	624
[16900]	625	if (!&is_url_encoded($text)) {
[23362]	626	$text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
[18410]	627	# return the url-encoded character entity for underscore back to the entity
	628	$text =~ s/%26%23095%3B/&\#095;/g;
[16900]	629	}
[16672]	630	return $text;
	631	}
	632
	633	sub url_decode {
[23387]	634	my ($text,$and_numeric_entities) = @_;
[16672]	635
[27333]	636	if(defined $text) {
	637	$text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
	638
	639	if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
	640	$text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
	641	$text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
	642	}
[23387]	643	}
	644
[16672]	645	return $text;
	646	}
	647
[23750]	648	sub url_decode_utf8 {
	649	my ($text,$and_numeric_entities) = @_;
	650
	651	$text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
	652
	653	$text = Encode::decode("utf8",$text);
	654
	655	return $text;
	656	}
	657
[16900]	658	sub is_url_encoded {
	659	my ($text) = @_;
[23362]	660	return ($text =~ m/\%([0-9A-F]{2})/i) \|\| ($text =~ m/\&\#x([0-9A-F]+;)/i) \|\| ($text =~ m/\&\#([0-9]+;)/i);
[16900]	661	}
	662
[16918]	663	# When a filename on the filesystem is already URL-encoded, the
[18319]	664	# URL to it will have %25s in place of every % sign, so that
[16918]	665	# URLs in html pages can refer to the URL-encoded filename.
	666	# This method changes the URL reference back into the actual
	667	# (URL-encoded) filename on the filesystem by replacing %25 with %.
	668	sub url_to_filename {
	669	my ($text) =@_;
[18379]	670	$text =~ s/%25/%/g if &is_url_encoded($text);
[18410]	671	# DM safing would have replaced underscores with character entity _
	672	# in SourceFile meta. Undo any such change to get the filename referred to.
	673	$text =~ s/&\#095;/_/g;
[16933]	674	return $text;
[16918]	675	}
	676
	677	# When a filename on the filesystem is already URL-encoded, the
[18410]	678	# URL to it will have %25s in place of every % sign, so that
[16918]	679	# URLs in html pages can refer to the URL-encoded filename.
	680	# Given a (URL-encoded) filename on the filesystem, this subroutine
	681	# returns the URL reference string for it by replacing % with %25.
	682	# The output string will be the same as the input string if the input
	683	# already contains one or more %25s. This is to prevent processing
	684	# a url more than once this way.
	685	sub filename_to_url {
[18404]	686	my ($text) = @_;
[16918]	687
[18404]	688	if($text !~ m/%25/) {
	689	$text =~ s/%/%25/g;
[16918]	690	}
[16933]	691	return $text;
[16918]	692	}
	693
[18342]	694	sub base64_encode {
	695	my ($text) = @_;
[18379]	696	if(!&conforms_to_mod_base64($text)) {
[18410]	697	# return entity for underscore to underscore before encoding
	698	$text =~ s/&\#095;/_/g;
	699
[18342]	700	$text = &MIME::Base64::encode_base64($text);
[18379]	701	# base64 encoding may introduce + and / signs,
	702	# replacing them with - and _ to ensure it's filename-safe
	703	$text =~ s/\+/\-/g; # + -> -
	704	$text =~ s/\//\_/g; # / -> _
[18342]	705	}
	706	return $text;
	707	}
[16918]	708
[18379]	709	# If the input fits the modified base64 pattern, this will try decoding it.
	710	# Still, this method does not guarantee the return value is the 'original', only
	711	# that the result is where the base64 decoding process has been applied once.
	712	# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
[18342]	713	sub base64_decode {
	714	my ($text) = @_;
[18379]	715	if(&conforms_to_mod_base64($text)) {
	716	# base64 encodes certain chars with + and /, but if we'd encoded it, we'd
	717	# have replaced them with - and _ respectively. Undo this before decoding.
	718	$text =~ s/\-/\+/g; # - -> +
	719	$text =~ s/\_/\//g; # _ -> /
[18342]	720	$text = &MIME::Base64::decode_base64($text);
	721	}
	722	return $text;
	723	}
	724
[18379]	725	# Returns true if the given string is compatible with a modified version
	726	# of base64 (where the + and / are replaced with - and _), a format which
	727	# includes also regular ASCII alphanumeric values. This method does not
	728	# guarantee that the given string is actually base64 encoded, since it will
	729	# return true for any simple alphanumeric ASCII string as well.
	730	sub conforms_to_mod_base64 {
[18342]	731	my ($text) = @_;
[18410]	732
	733	# need to treat the entity ref for underscore as underscore
	734	$text =~ s/&\#095;/_/g;
	735
[18379]	736	# base 64 takes alphanumeric and [=+/],
	737	# but we use modified base64 where + and / are replaced with - and _
	738	return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
[18342]	739	}
	740
[9125]	741	sub substr
	742	{
	743	my ($utf8_string, $offset, $length) = @_;
	744
	745	my @unicode_string = @{&utf82unicode($utf8_string)};
	746	my $unicode_string_length = scalar(@unicode_string);
	747
	748	my $substr_start = $offset;
	749	if ($substr_start >= $unicode_string_length) {
	750	return "";
	751	}
	752
	753	my $substr_end = $offset + $length - 1;
	754	if ($substr_end >= $unicode_string_length) {
	755	$substr_end = $unicode_string_length - 1;
	756	}
	757
	758	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
	759	return &unicode2utf8(\@unicode_substring);
	760	}
	761
[23285]	762	# Useful method to print UTF8 (or other unicode) for debugging.
	763	# Characters that are easily displayed (that is, printable ASCII)
	764	# are shown as-is, whereas hex values of the unicode code points
	765	# are shown for all other chars.
	766	sub debug_unicode_string
	767	{
	768	join("",
[23831]	769	map { $_ > 127 ? # if wide character...
[23285]	770	sprintf("\\x{%04X}", $_) : # \x{...}
	771	chr($_)
	772	} unpack("U*", $_[0])); # unpack Unicode characters
	773	}
[9125]	774
[23304]	775
	776	sub raw_filename_to_url_encoded
	777	{
	778	my ($str_in) = @_;
	779
	780	my @url_encoded_chars
[23362]	781	= map { $_ > 255 ? # Needs to be represent in entity form
	782	sprintf("&#x%X;",$_) :
[23831]	783	$_>127 \|\| $_==ord("%") ? # Representable in %XX form
[23362]	784	sprintf("%%%2X", $_) :
	785	chr($_) # otherwise, Ascii char
	786	} unpack("U*", $str_in); # Unpack Unicode characters
[23304]	787
	788
	789	my $str_out = join("", @url_encoded_chars);
	790
	791	return $str_out;
	792
	793	}
	794
	795	sub url_encoded_to_raw_filename
	796	{
	797	my ($str_in) = @_;
	798
	799	my $str_out = $str_in;
	800
[23371]	801	$str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
[23362]	802	$str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
	803	$str_out =~ s/&#([0-9]+);/chr($1)/eig;
[23304]	804
	805	return $str_out;
	806	}
	807
[23387]	808
	809	sub raw_filename_to_utf8_url_encoded
	810	{
	811	my ($str_in) = @_;
	812
	813	$str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
	814
	815	my @url_encoded_chars
[23831]	816	= map { $_ > 127 ? # Representable in %XX form
[23387]	817	sprintf("%%%2X", $_) :
	818	chr($_) # otherwise, Ascii char
	819	} unpack("U*", $str_in); # Unpack utf8 characters
	820
	821
	822	my $str_out = join("", @url_encoded_chars);
	823
	824	return $str_out;
	825
	826	}
	827
	828	sub utf8_url_encoded_to_raw_filename
	829	{
	830	my ($str_in) = @_;
	831
	832	my $utf8_str_out = $str_in;
	833
	834	$utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
	835
	836	my $unicode_str_out = decode("utf8",$utf8_str_out);
	837	my $raw_str_out = utf8::downgrade($unicode_str_out);
	838
	839	return $raw_str_out;
	840	}
	841
	842	sub analyze_raw_string
	843	{
	844	my ($str_in) = @_;
	845
	846	my $uses_bytecodes = 0;
	847	my $exceeds_bytecodes = 0;
	848
	849	map { $exceeds_bytecodes = 1 if ($_ >= 256);
	850	$uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
	851	} unpack("U*", $str_in); # Unpack Unicode characters
	852
	853	return ($uses_bytecodes,$exceeds_bytecodes);
	854	}
	855
	856
[24193]	857	sub convert_utf8_string_to_unicode_string
	858	{
	859	my $utf8_string = shift(@_);
	860
	861	my $unicode_string = "";
	862	foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
	863	$unicode_string .= chr($unicode_value);
	864	}
	865	return $unicode_string;
	866	}
	867
	868	sub convert_unicode_string_to_utf8_string
	869	{
	870	my $unicode_string = shift(@_);
	871
	872	my @unicode_array;
	873	for (my $i = 0; $i < length($unicode_string); $i++) {
	874	push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
	875	}
	876	return &unicode::unicode2utf8(\@unicode_array);
	877	}
	878
	879
[71]	880	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: