Context Navigation

source: gsdl/trunk/perllib/unicode.pm@ 18338

Last change on this file since 18338 was 18338, checked in by ak19, 15 years ago
Moved the line of code that replaced spaces with underscores to the subroutine util::rename_file, since this replacement ought to be done not only when url_encoding but also when using base64 encoding.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 17.5 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# unicode.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
[7798]	8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
[537]	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[71]	26	# useful functions for dealing with Unicode
	27
	28	# Unicode strings are stored as arrays of scalars as perl
	29	# lacks characters are 8-bit (currently)
	30
	31	package unicode;
[3767]	32
[3834]	33	eval {require bytes};
[3767]	34
[1870]	35	use encodings;
[15894]	36	use strict;
[2713]	37	use util;
[71]	38
[18245]	39	no strict 'refs';
	40
[1223]	41	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
	42	# and returns a unicode array.
[73]	43	sub ascii2unicode {
	44	my ($in) = @_;
	45	my $out = [];
	46
	47	my $i = 0;
	48	my $len = length($in);
	49	while ($i < $len) {
	50	push (@$out, ord(substr ($in, $i, 1)));
	51	$i++;
	52	}
	53
	54	return $out;
	55	}
	56
[1870]	57	# ascii2utf8 takes a reference to an (extended) ascii string and returns a
	58	# UTF-8 encoded string. This is just a faster version of
	59	# "&unicode2utf8(&ascii2unicode($str));"
[7815]	60	# "Extended ascii" really means "iso_8859_1"
[73]	61	sub ascii2utf8 {
	62	my ($in) = @_;
	63	my $out = "";
	64
[2483]	65	if (!defined($in)\|\| !defined($$in)) {
	66	return $out;
	67	}
	68
[73]	69	my ($c);
	70	my $i = 0;
[1870]	71	my $len = length($$in);
[73]	72	while ($i < $len) {
[1870]	73	$c = ord (substr ($$in, $i, 1));
[73]	74	if ($c < 0x80) {
	75	# ascii character
	76	$out .= chr ($c);
	77
	78	} else {
	79	# extended ascii character
	80	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
	81	$out .= chr (0x80 + ($c & 0x3f));
	82	}
	83	$i++;
	84	}
	85
	86	return $out;
	87	}
	88
[71]	89	# unicode2utf8 takes a unicode array as input and encodes it
	90	# using utf-8
	91	sub unicode2utf8 {
	92	my ($in) = @_;
	93	my $out = "";
	94
[8716]	95	foreach my $num (@$in) {
[1868]	96	next unless defined $num;
[71]	97	if ($num < 0x80) {
	98	$out .= chr ($num);
	99
	100	} elsif ($num < 0x800) {
	101	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
	102	$out .= chr (0x80 + ($num & 0x3f));
	103
	104	} elsif ($num < 0xFFFF) {
	105	$out .= chr (0xe0 + (($num >> 12) & 0xf));
	106	$out .= chr (0x80 + (($num >> 6) & 0x3f));
	107	$out .= chr (0x80 + ($num & 0x3f));
	108
	109	} else {
	110	# error, don't encode anything
	111	die;
	112	}
	113	}
	114	return $out;
	115	}
	116
	117	# utf82unicode takes a utf-8 string and produces a unicode
	118	# array
	119	sub utf82unicode {
	120	my ($in) = @_;
	121	my $out = [];
	122
	123	my $i = 0;
	124	my ($c1, $c2, $c3);
[8716]	125	my $len = length($in);
[71]	126	while ($i < $len) {
	127	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
	128	# normal ascii character
	129	push (@$out, $c1);
	130
	131	} elsif ($c1 < 0xc0) {
	132	# error, was expecting the first byte of an
	133	# encoded character. Do nothing.
	134
	135	} elsif ($c1 < 0xe0 && $i+1 < $len) {
	136	# an encoded character with two bytes
	137	$c2 = ord (substr ($in, $i+1, 1));
	138	if ($c2 >= 0x80 && $c2 < 0xc0) {
	139	# everything looks ok
	140	push (@$out, ((($c1 & 0x1f) << 6) +
	141	($c2 & 0x3f)));
	142	$i++; # gobbled an extra byte
	143	}
	144
	145	} elsif ($c1 < 0xf0 && $i+2 < $len) {
	146	# an encoded character with three bytes
	147	$c2 = ord (substr ($in, $i+1, 1));
	148	$c3 = ord (substr ($in, $i+2, 1));
	149	if ($c2 >= 0x80 && $c2 < 0xc0 &&
	150	$c3 >= 0x80 && $c3 < 0xc0) {
	151	# everything looks ok
	152	push (@$out, ((($c1 & 0xf) << 12) +
	153	(($c2 & 0x3f) << 6) +
	154	($c3 & 0x3f)));
	155
	156	$i += 2; # gobbled an extra two bytes
	157	}
	158
	159	} else {
	160	# error, only decode Unicode characters not full UCS.
	161	# Do nothing.
	162	}
	163
	164	$i++;
	165	}
	166
	167	return $out;
	168	}
	169
	170	# unicode2ucs2 takes a unicode array and produces a UCS-2
	171	# unicode string (every two bytes forms a unicode character)
	172	sub unicode2ucs2 {
	173	my ($in) = @_;
	174	my $out = "";
	175
[8716]	176	foreach my $num (@$in) {
[71]	177	$out .= chr (($num & 0xff00) >> 8);
	178	$out .= chr ($num & 0xff);
	179	}
	180
	181	return $out;
	182	}
	183
	184	# ucs22unicode takes a UCS-2 string and produces a unicode array
	185	sub ucs22unicode {
	186	my ($in) = @_;
	187	my $out = [];
	188
	189	my $i = 0;
	190	my $len = length ($in);
	191	while ($i+1 < $len) {
	192	push (@$out, ord (substr($in, $i, 1)) << 8 +
	193	ord (substr($in, $i+1, 1)));
	194
	195	$i ++;
	196	}
	197
	198	return $out;
	199	}
	200
[1870]	201	# takes a reference to a string and returns a reference to a unicode array
	202	sub convert2unicode {
	203	my ($encoding, $textref) = @_;
[71]	204
[1870]	205	if (!defined $encodings::encodings->{$encoding}) {
	206	print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
	207	return [];
[1227]	208	}
	209
[1870]	210	my $encodename = "$encoding-unicode";
	211	my $enc_info = $encodings::encodings->{$encoding};
	212	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
	213	"to_uc", $enc_info->{'mapfile'});
	214	if (!&loadmapencoding ($encodename, $mapfile)) {
[10983]	215	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
[1870]	216	return [];
[1227]	217	}
[1870]	218
	219	if (defined $enc_info->{'converter'}) {
	220	my $converter = $enc_info->{'converter'};
	221	return &$converter ($encodename, $textref);
	222	}
[1227]	223
[8716]	224	if ($unicode::translations{$encodename}->{'count'} == 1) {
[1870]	225	return &singlebyte2unicode ($encodename, $textref);
	226	} else {
	227	return &doublebyte2unicode ($encodename, $textref);
	228	}
	229	}
[1227]	230
[1870]	231	# singlebyte2unicode converts simple 8 bit encodings where characters below
	232	# 0x80 are normal ascii characters and the rest are decoded using the
	233	# appropriate mapping files.
	234	#
	235	# Examples of encodings that may be converted using singlebyte2unicode are
[4229]	236	# the iso-8859 and windows-125* series.
[1870]	237	sub singlebyte2unicode {
	238	my ($encodename, $textref) = @_;
[1227]	239
[1870]	240	my @outtext = ();
	241	my $len = length($$textref);
	242	my ($c);
	243	my $i = 0;
[1227]	244
[1870]	245	while ($i < $len) {
	246	if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
	247	# normal ascii character
	248	push (@outtext, $c);
	249	} else {
	250	$c = &transchar ($encodename, $c);
	251	# put a black square if cannot translate
	252	$c = 0x25A1 if $c == 0;
	253	push (@outtext, $c);
	254	}
	255	$i ++;
[1227]	256	}
[1870]	257	return \@outtext;
	258	}
[1227]	259
[1870]	260	# doublebyte2unicode converts simple two byte encodings where characters
	261	# below code point 0x80 are single-byte characters and the rest are
	262	# double-byte characters.
	263	#
	264	# Examples of encodings that may be converted using doublebyte2unicode are
	265	# CJK encodings like GB encoded Chinese and UHC Korean.
	266	#
	267	# Note that no error checking is performed to make sure that the input text
	268	# is valid for the given encoding.
	269	#
	270	# Also, encodings that may contain characters of more than two bytes are
	271	# not supported (any EUC encoded text may in theory contain 3-byte
	272	# characters but in practice only one and two byte characters are used).
	273	sub doublebyte2unicode {
	274	my ($encodename, $textref) = @_;
	275
	276	my @outtext = ();
	277	my $len = length($$textref);
	278	my ($c1, $c2);
	279	my $i = 0;
[1227]	280
[1870]	281	while ($i < $len) {
	282	if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
	283	if ($i+1 < $len) {
	284	# double-byte character
	285	$c2 = ord(substr($$textref, $i+1, 1));
	286	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	287	# put a black square if cannot translate
	288	$c = 0x25A1 if $c == 0;
	289	push (@outtext, $c);
	290	$i += 2;
	291
	292	} else {
	293	# error
	294	print STDERR "unicode: ERROR missing second half of double-byte character\n";
	295	$i++;
	296	}
	297
	298	} else {
	299	# single-byte character
	300	push (@outtext, $c1);
	301	$i++;
	302	}
	303	}
	304	return \@outtext;
[1227]	305	}
	306
[1870]	307	# Shift-JIS to unicode
	308	# We can't use doublebyte2unicode for Shift-JIS because it uses some
	309	# single-byte characters above code point 0x80 (i.e. half-width katakana
	310	# characters in the range 0xA1-0xDF)
	311	sub shiftjis2unicode {
	312	my ($encodename, $textref) = @_;
	313
	314	my @outtext = ();
	315	my $len = length($$textref);
	316	my ($c1, $c2);
	317	my $i = 0;
[1227]	318
[1870]	319	while ($i < $len) {
	320	$c1 = ord(substr($$textref, $i, 1));
[1868]	321
[1870]	322	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
	323	# Single-byte half-width katakana character or
	324	# JIS Roman yen or overline characters
	325	my $c = &transchar ($encodename, $c1);
	326	# - put a black square if cannot translate
	327	$c = 0x25A1 if $c == 0;
	328	push (@outtext, $c);
	329	$i++;
[1868]	330
[1870]	331	} elsif ($c1 < 0x80) {
	332	# ASCII
	333	push (@outtext, $c1);
	334	$i ++;
[1868]	335
[1870]	336	} elsif ($c1 < 0xEF) {
	337	if ($i+1 < $len) {
	338	$c2 = ord(substr($$textref, $i+1, 1));
	339	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
	340	# Double-byte shift-jis character
	341	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	342	# put a black square if cannot translate
	343	$c = 0x25A1 if $c == 0;
	344	push (@outtext, $c);
	345	} else {
	346	# error
	347	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
	348	}
	349	$i += 2;
	350	} else {
	351	# error
	352	print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
	353	$i ++;
	354	}
	355	} else {
	356	# error
	357	print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
	358	$i ++;
	359	}
	360	}
	361	return \@outtext;
	362	}
[1868]	363
[1870]	364	sub transchar {
	365	my ($encoding, $from) = @_;
	366	my $high = ($from / 256) % 256;
	367	my $low = $from % 256;
[1868]	368
[8716]	369	return 0 unless defined $unicode::translations{$encoding};
[1868]	370
[8716]	371	my $block = $unicode::translations{$encoding}->{'map'};
[1868]	372
[1870]	373	if (ref ($block->[$high]) ne "ARRAY") {
	374	return 0;
	375	}
	376	return $block->[$high]->[$low];
	377	}
[1868]	378
	379	# %translations is of the form:
	380	#
[1870]	381	# encodings{encodingname-encodingname}->{'map'}->blocktranslation
[1868]	382	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
	383	#
	384	# Any of the top translation blocks can point to an undefined
	385	# value. This data structure aims to allow fast translation and
	386	# efficient storage.
[8716]	387	%unicode::translations = ();
[1868]	388
	389	# @array256 is used for initialisation, there must be
	390	# a better way...
[4229]	391	# What about this?: @array256 = (0) x 256;
[8716]	392	@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
[1868]	393	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	394	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	395	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	396	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	397	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	398	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	399	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	400	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	401	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	402	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	403	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	404	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	405	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	406	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	407	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
	408
	409	# returns 1 if successful, 0 if unsuccessful
	410	sub loadmapencoding {
	411	my ($encoding, $mapfile) = @_;
	412
	413	# check to see if the encoding has already been loaded
[8716]	414	return 1 if (defined $unicode::translations{$encoding});
[1868]	415
[9967]	416	if (! -r $mapfile \|\| -d $mapfile) {
	417	return 0;
	418	}
[1868]	419	return 0 unless open (MAPFILE, $mapfile);
	420	binmode (MAPFILE);
	421
[8716]	422	$unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
	423	my $block = $unicode::translations{$encoding};
[1868]	424
	425	my ($in,$i,$j);
[9967]	426	while (1) {
	427	my $ret=read(MAPFILE, $in, 1);
	428	if (!defined($ret)) { # error
	429	print STDERR "unicode.pm: error reading mapfile: $!\n";
	430	last;
	431	}
	432	if ($ret != 1) { last }
[1868]	433	$i = unpack ("C", $in);
[8716]	434	$block->{'map'}->[$i] = [@unicode::array256];
[1868]	435	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
	436	my ($n1, $n2) = unpack ("CC", $in);
[1870]	437	$block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
[1868]	438	}
[1870]	439	$block->{'count'} ++;
[1868]	440	}
	441
	442	close (MAPFILE);
	443	}
	444
[4229]	445	# unicode2singlebyte converts unicode to simple 8 bit encodings where
	446	# characters below 0x80 are normal ascii characters and the rest are encoded
	447	# using the appropriate mapping files.
	448	#
	449	# Examples of encodings that may be converted using unicode2singlebyte are
	450	# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
	451	sub unicode2singlebyte {
	452	my ($uniref, $encoding) = @_;
	453
[2237]	454	my $outtext = "";
[4229]	455	my $encodename = "unicode-$encoding";
[9967]	456
	457	if (!exists $encodings::encodings->{$encoding}) {
	458	print STDERR "unicode.pm: ERROR - unsupported encoding "
	459	. "'$encoding' requested\n";
	460	return "";
	461	}
	462
	463	my $enc_info = $encodings::encodings->{$encoding};
[2237]	464	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
	465	"from_uc", $enc_info->{'mapfile'});
	466	if (!&loadmapencoding ($encodename, $mapfile)) {
[10983]	467	print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
[2237]	468	return "";
	469	}
	470
	471	foreach my $c (@$uniref) {
	472	if ($c < 0x80) {
	473	# normal ascii character
	474	$outtext .= chr($c);
	475	} else {
	476	# extended ascii character
	477	$c = &transchar ($encodename, $c);
	478
	479	# put a question mark if cannot translate
	480	if ($c == 0) {
	481	$outtext .= "?";
	482	} else {
	483	$outtext .= chr($c);
	484	}
	485	}
	486	}
	487	return $outtext;
	488	}
	489
[7798]	490
	491	# this makes sure that the referenced input string is utf8 encoded, and
	492	# will change/remove bytes that aren't.
	493	# returns 0 if the text was already utf8, or 1 if text modified to become utf8
	494	sub ensure_utf8 {
	495	my $stringref=shift;
	496
[8217]	497	if (!defined($stringref) \|\| ref($stringref) ne 'SCALAR') {
	498	return $stringref;
	499	}
	500
[7798]	501	my $value=$$stringref;
	502
	503	my $non_utf8_found = 0;
	504	$value =~ m/^/g; # to set \G
	505	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
	506	my $highbytes=$1;
	507	my $highbyteslength=length($highbytes);
	508	# make sure this block of high bytes is utf-8
	509	$highbytes =~ /^/g; # set pos()
	510	my $byte_replaced = 0;
	511	while ($highbytes =~
[8903]	512	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
[7798]	513	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
[8903]	514	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	515	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	516	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
[7798]	517	)*([\x80-\xff])? !xg
	518	) {
	519	# this highbyte is "out-of-place" for valid utf-8
	520	my $badbyte=$1;
	521	if (!defined $badbyte) {next} # hit end of string
	522	my $pos=pos($highbytes);
	523	# replace bad byte. assume iso-8859-1 -> utf-8
	524	# ascii2utf8 does "extended ascii"... ie iso-8859-1
	525	my $replacement=&unicode::ascii2utf8(\$badbyte);
	526	substr($highbytes, $pos-1, 1, $replacement);
	527	# update the position to continue searching (for \G)
	528	pos($highbytes) = $pos+length($replacement)-1;
	529	$byte_replaced = 1;
	530	}
	531	if ($byte_replaced) {
	532	# replace this block of high bytes in the $value
	533	$non_utf8_found = 1;
	534	my $replength=length($highbytes); # we've changed the length
	535	my $textpos=pos($value); # pos at end of last match
	536	# replace bad bytes with good bytes
	537	substr($value, $textpos-$highbyteslength,
	538	$highbyteslength, $highbytes);
	539	# update the position to continue searching (for \G)
	540	pos($value)=$textpos+($replength-$highbyteslength)+1;
	541	}
	542	}
	543
[7817]	544	$$stringref = $value;
[7798]	545	return $non_utf8_found;
	546	}
	547
[16553]	548	# Returns true (1) if the given string is utf8 and false (0) if it isn't.
	549	# Does not modify the string parameter.
	550	sub check_is_utf8 {
	551	my $value=shift;
	552
	553	if (!defined($value)) {
	554	return 0; # not utf8 because it is undefined
	555	}
	556
	557	$value =~ m/^/g; # to set \G
	558	while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
	559	my $highbytes=$1;
	560	# make sure this block of high bytes is utf-8
	561	$highbytes =~ /^/g; # set pos()
	562	while ($highbytes =~
	563	m!\G (?: [\xc0-\xdf][\x80-\xbf] \| # 2 byte utf-8
	564	[\xe0-\xef][\x80-\xbf]{2} \| # 3 byte
	565	[\xf0-\xf7][\x80-\xbf]{3} \| # 4 byte
	566	[\xf8-\xfb][\x80-\xbf]{4} \| # 5 byte
	567	[\xfc-\xfd][\x80-\xbf]{5} \| # 6 byte
	568	)*([\x80-\xff])? !xg
	569	) {
	570	my $badbyte=$1;
	571	if (defined $badbyte) { # not end of string
	572	return 0; # non-utf8 found
	573	}
	574	}
	575	}
	576
	577	return 1;
	578	}
	579
[16672]	580	sub url_encode {
	581	my ($text) = @_;
[18338]	582
[16900]	583	if (!&is_url_encoded($text)) {
[18201]	584	$text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
[16900]	585	}
[16672]	586	return $text;
	587	}
	588
	589	sub url_decode {
	590	my ($text) = @_;
	591
	592	$text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
	593	return $text;
	594	}
	595
[16900]	596	sub is_url_encoded {
	597	my ($text) = @_;
	598	return ($text =~ m/\%([A-F0-9]{2})/);
	599	}
	600
[16918]	601	# When a filename on the filesystem is already URL-encoded, the
[18319]	602	# URL to it will have %25s in place of every % sign, so that
[16918]	603	# URLs in html pages can refer to the URL-encoded filename.
	604	# This method changes the URL reference back into the actual
	605	# (URL-encoded) filename on the filesystem by replacing %25 with %.
	606	sub url_to_filename {
	607	my ($text) =@_;
[18319]	608	$text =~ s/%25/%/g if is_url_encoded($text);
[16933]	609	return $text;
[16918]	610	}
	611
	612	# When a filename on the filesystem is already URL-encoded, the
	613	# URL to it will have %25s in in place of every % sign, so that
	614	# URLs in html pages can refer to the URL-encoded filename.
	615	# Given a (URL-encoded) filename on the filesystem, this subroutine
	616	# returns the URL reference string for it by replacing % with %25.
	617	# The output string will be the same as the input string if the input
	618	# already contains one or more %25s. This is to prevent processing
	619	# a url more than once this way.
	620	sub filename_to_url {
[18319]	621	my ($text, $rename_method) = @_;
[16918]	622
[18319]	623	if(!defined $rename_method \|\| $rename_method eq "url") {
	624	if($text !~ m/%25/) {
	625	$text =~ s/%/%25/g;
	626	}
[16918]	627	}
[16933]	628	return $text;
[16918]	629	}
	630
	631
[9125]	632	sub substr
	633	{
	634	my ($utf8_string, $offset, $length) = @_;
	635
	636	my @unicode_string = @{&utf82unicode($utf8_string)};
	637	my $unicode_string_length = scalar(@unicode_string);
	638
	639	my $substr_start = $offset;
	640	if ($substr_start >= $unicode_string_length) {
	641	return "";
	642	}
	643
	644	my $substr_end = $offset + $length - 1;
	645	if ($substr_end >= $unicode_string_length) {
	646	$substr_end = $unicode_string_length - 1;
	647	}
	648
	649	my @unicode_substring = @unicode_string[$substr_start..$substr_end];
	650	return &unicode2utf8(\@unicode_substring);
	651	}
	652
	653
[71]	654	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: