Context Navigation

source: main/trunk/greenstone2/perllib/unicode.pm@ 21411

Last change on this file since 21411 was 18410, checked in by ak19, 15 years ago
Some changes made in url and base64 encoding to deal with when the character entity for underscore had replaced the underscore (after dmsafing a filename so it won't cause conflicts with GS macros).
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 19.5 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999-2004 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars (one code point per
29	# element) because perl characters are 8-bit (currently)
30
31	package unicode;
32
33	eval {require bytes};
34
35	use encodings;
36	use strict;
37	use util;
38	use MIME::Base64; # for base64 encoding
39
40	no strict 'refs';
41
# ascii2unicode converts an (extended) ascii string (ISO-8859-1) into a
# unicode array: a reference to an array holding the ordinal value of each
# character of the input, in order.
sub ascii2unicode {
    my ($in) = @_;

    # an undefined input has no characters, so it yields an empty array
    return [] unless defined $in;

    my @codes = map { ord($_) } split(//, $in);
    return \@codes;
}
57
# ascii2utf8 takes a reference to an (extended) ascii string, where
# "extended ascii" really means iso_8859_1, and returns the UTF-8 encoded
# string. It gives the same result as
# "&unicode2utf8(&ascii2unicode($str));" without building the intermediate
# unicode array. An undefined reference, or a reference to an undefined
# value, gives the empty string.
sub ascii2utf8 {
    my ($in) = @_;

    return "" unless defined($in) && defined($$in);

    my $utf8 = "";
    foreach my $char (split(//, $$in)) {
        my $code = ord($char);
        if ($code >= 0x80) {
            # extended ascii character: always a two byte sequence
            $utf8 .= chr(0xc0 + (($code >> 6) & 0x1f))
                   . chr(0x80 + ($code & 0x3f));
        }
        else {
            # plain ascii character is copied through unchanged
            $utf8 .= chr($code);
        }
    }

    return $utf8;
}
89
# unicode2utf8 takes a unicode array (a reference to an array of code
# points) as input and encodes it using utf-8.
#
# Undefined array elements are skipped. Code points up to and including
# 0xFFFF are encoded as one, two or three bytes; anything larger cannot be
# represented by this encoder and makes the function die.
sub unicode2utf8 {
    my ($in) = @_;
    my $out = "";

    foreach my $num (@$in) {
        next unless defined $num;
        if ($num < 0x80) {
            # single byte (ascii)
            $out .= chr ($num);

        } elsif ($num < 0x800) {
            # two byte sequence
            $out .= chr (0xc0 + (($num >> 6) & 0x1f));
            $out .= chr (0x80 + ($num & 0x3f));

        } elsif ($num <= 0xFFFF) {
            # three byte sequence; the upper bound is inclusive so that
            # 0xFFFF itself is encoded rather than rejected
            $out .= chr (0xe0 + (($num >> 12) & 0xf));
            $out .= chr (0x80 + (($num >> 6) & 0x3f));
            $out .= chr (0x80 + ($num & 0x3f));

        } else {
            # error, don't encode anything
            die sprintf("unicode::unicode2utf8: cannot encode code point 0x%X (above 0xFFFF)\n", $num);
        }
    }
    return $out;
}
117
# utf82unicode takes a utf-8 string and produces a unicode array (a
# reference to an array of code points).
#
# Only one, two and three byte sequences are decoded. Anything that is not
# a well-formed sequence of that kind (a stray continuation byte, a lead
# byte without valid continuation bytes, a truncated sequence, or a lead
# byte of a longer sequence) contributes nothing to the output and the
# scan simply moves on by one byte.
sub utf82unicode {
    my ($in) = @_;

    my @bytes   = map { ord($_) } split(//, $in);
    my $total   = scalar(@bytes);
    my @decoded = ();

    my $pos = 0;
    while ($pos < $total) {
        my $lead  = $bytes[$pos];
        my $extra = 0; # continuation bytes consumed by this character

        if ($lead < 0x80) {
            # normal ascii character
            push(@decoded, $lead);
        }
        elsif ($lead < 0xc0) {
            # continuation byte where a lead byte was expected: ignored
        }
        elsif ($lead < 0xe0 && $pos + 1 < $total) {
            # two byte sequence
            my $second = $bytes[$pos + 1];
            if ($second >= 0x80 && $second < 0xc0) {
                push(@decoded, (($lead & 0x1f) << 6) + ($second & 0x3f));
                $extra = 1;
            }
        }
        elsif ($lead < 0xf0 && $pos + 2 < $total) {
            # three byte sequence
            my $second = $bytes[$pos + 1];
            my $third  = $bytes[$pos + 2];
            if ($second >= 0x80 && $second < 0xc0 &&
                $third  >= 0x80 && $third  < 0xc0) {
                push(@decoded, (($lead & 0xf) << 12) +
                               (($second & 0x3f) << 6) +
                               ($third & 0x3f));
                $extra = 2;
            }
        }
        # else: only Unicode characters are decoded, not full UCS; ignored

        $pos += 1 + $extra;
    }

    return \@decoded;
}
170
# unicode2ucs2 takes a unicode array and produces a UCS-2 unicode string:
# every code point becomes two bytes, high byte first. Only the low 16
# bits of each code point are used.
sub unicode2ucs2 {
    my ($in) = @_;

    return join("", map { chr(($_ & 0xff00) >> 8) . chr($_ & 0xff) } @$in);
}
184
# ucs22unicode takes a UCS-2 string (two bytes per character, high byte
# first, as produced by unicode2ucs2) and produces a unicode array.
#
# A trailing odd byte, if any, is ignored. An undefined input gives an
# empty array.
sub ucs22unicode {
    my ($in) = @_;
    my $out = [];

    return $out unless defined $in;

    my $len = length ($in);
    # step through the string one byte pair at a time
    for (my $i = 0; $i+1 < $len; $i += 2) {
        # the shift must be parenthesised: "+" binds tighter than "<<"
        push (@$out, (ord (substr($in, $i, 1)) << 8) +
                     ord (substr($in, $i+1, 1)));
    }

    return $out;
}
201
# convert2unicode takes an encoding name and a reference to a string in
# that encoding, and returns a reference to a unicode array.
#
# The encoding must have an entry in $encodings::encodings, and its
# "to_uc" mapping file must be loadable; otherwise an error is printed to
# STDERR and an empty array is returned. If the entry names a 'converter'
# that routine does the work, otherwise the text is treated as a single
# byte encoding when the mapping holds exactly one block and as a double
# byte encoding when it holds more.
sub convert2unicode {
    my ($encoding, $textref) = @_;

    unless (defined $encodings::encodings->{$encoding}) {
        print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
        return [];
    }

    my $encodename = "$encoding-unicode";
    my $enc_info   = $encodings::encodings->{$encoding};
    my $mapfile    = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
                                         "to_uc", $enc_info->{'mapfile'});
    unless (&loadmapencoding($encodename, $mapfile)) {
        print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
        return [];
    }

    # an encoding specific converter takes precedence over the generic ones
    my $converter = $enc_info->{'converter'};
    return &$converter($encodename, $textref) if defined $converter;

    my $single_block = ($unicode::translations{$encodename}->{'count'} == 1);
    return $single_block
        ? &singlebyte2unicode($encodename, $textref)
        : &doublebyte2unicode($encodename, $textref);
}
231
# singlebyte2unicode converts simple 8 bit encodings: characters below
# 0x80 are taken to be normal ascii and are copied through, everything
# else is looked up with transchar in the mapping loaded for $encodename.
# A character that cannot be translated (transchar gives 0) is replaced
# by the square character 0x25A1.
#
# Takes the mapping name and a reference to the text; returns a reference
# to a unicode array.
#
# Examples of encodings that may be converted using singlebyte2unicode are
# the iso-8859 and windows-125* series.
sub singlebyte2unicode {
    my ($encodename, $textref) = @_;

    my @outtext = ();
    foreach my $char (split(//, $$textref)) {
        my $code = ord($char);
        if ($code >= 0x80) {
            $code = &transchar($encodename, $code);
            # put a square if cannot translate
            $code = 0x25A1 if $code == 0;
        }
        push(@outtext, $code);
    }
    return \@outtext;
}
260
# doublebyte2unicode converts simple two byte encodings: a byte below
# 0x80 is a single-byte character and is copied through, any other byte
# is combined with the byte after it into one double-byte character which
# is looked up with transchar. A character that cannot be translated is
# replaced by the square character 0x25A1. A lead byte with nothing after
# it is reported on STDERR and skipped.
#
# Examples of encodings that may be converted using doublebyte2unicode are
# CJK encodings like GB encoded Chinese and UHC Korean.
#
# Note that no error checking is performed to make sure that the input text
# is valid for the given encoding.
#
# Also, encodings that may contain characters of more than two bytes are
# not supported (any EUC encoded text may in theory contain 3-byte
# characters but in practice only one and two byte characters are used).
sub doublebyte2unicode {
    my ($encodename, $textref) = @_;

    my @bytes   = map { ord($_) } split(//, $$textref);
    my $total   = scalar(@bytes);
    my @outtext = ();

    my $pos = 0;
    while ($pos < $total) {
        my $lead = $bytes[$pos];

        if ($lead < 0x80) {
            # single-byte character
            push(@outtext, $lead);
            $pos++;
            next;
        }

        if ($pos + 1 >= $total) {
            # error
            print STDERR "unicode: ERROR missing second half of double-byte character\n";
            $pos++;
            next;
        }

        # double-byte character
        my $code = &transchar($encodename, ($lead << 8) | $bytes[$pos + 1]);
        # put a square if cannot translate
        $code = 0x25A1 if $code == 0;
        push(@outtext, $code);
        $pos += 2;
    }
    return \@outtext;
}
307
# Shift-JIS to unicode
# We can't use doublebyte2unicode for Shift-JIS because it uses some
# single-byte characters above code point 0x80 (i.e. half-width katakana
# characters in the range 0xA1-0xDF)
#
# Takes the mapping name and a reference to the text; returns a reference
# to a unicode array. Characters that cannot be translated are replaced by
# the square character 0x25A1; malformed input is reported on STDERR and
# skipped.
sub shiftjis2unicode {
    my ($encodename, $textref) = @_;

    my @bytes   = map { ord($_) } split(//, $$textref);
    my $total   = scalar(@bytes);
    my @outtext = ();

    my $pos = 0;
    while ($pos < $total) {
        my $lead = $bytes[$pos];

        # Single-byte half-width katakana character or
        # JIS Roman yen or overline characters
        my $mapped_single = ($lead >= 0xA1 && $lead <= 0xDF)
                            || $lead == 0x5c || $lead == 0x7E;
        if ($mapped_single) {
            my $code = &transchar($encodename, $lead);
            # - put a square if cannot translate
            $code = 0x25A1 if $code == 0;
            push(@outtext, $code);
            $pos++;
            next;
        }

        if ($lead < 0x80) {
            # ASCII
            push(@outtext, $lead);
            $pos++;
            next;
        }

        if ($lead >= 0xEF) {
            # error
            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
            $pos++;
            next;
        }

        if ($pos + 1 >= $total) {
            # error
            print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
            $pos++;
            next;
        }

        my $trail = $bytes[$pos + 1];
        if (($trail >= 0x40 && $trail <= 0x7E) || ($trail >= 0x80 && $trail <= 0xFC)) {
            # Double-byte shift-jis character
            my $code = &transchar($encodename, ($lead << 8) | $trail);
            # put a square if cannot translate
            $code = 0x25A1 if $code == 0;
            push(@outtext, $code);
        }
        else {
            # error
            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
        }
        # both bytes are consumed whether or not the pair was valid
        $pos += 2;
    }
    return \@outtext;
}
364
# transchar looks up one character code in the translation table that has
# been loaded under the name $encoding (see %unicode::translations).
# The code is split into a block number (second lowest byte) and an index
# within the block (lowest byte). Returns the translated value, or 0 when
# the encoding has not been loaded or the block is not present.
sub transchar {
    my ($encoding, $from) = @_;

    my $table = $unicode::translations{$encoding};
    return 0 unless defined $table;

    my $blocks = $table->{'map'};
    my $high   = ($from / 256) % 256;
    my $low    = $from % 256;

    # a block that was never loaded is not an array reference
    my $block = $blocks->[$high];
    return 0 if ref($block) ne "ARRAY";

    return $block->[$low];
}
379
# %translations is of the form:
#
# encodings{encodingname-encodingname}->{'map'}->blocktranslation
# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
#
# Any of the top translation blocks can point to an undefined
# value. This data structure aims to allow fast translation and
# efficient storage.
%unicode::translations = ();

# @array256 holds 256 zeros; it is copied whenever a fresh, empty
# translation block (or top level block list) is needed.
@unicode::array256 = (0) x 256;
409
# loadmapencoding reads the binary mapping file $mapfile and stores its
# contents in %unicode::translations under the name $encoding.
#
# The file is read as a sequence of blocks: one byte giving the block
# number, followed by up to 256 two byte values (high byte first). The
# number of blocks read is recorded as 'count'.
#
# returns 1 if successful (or if the encoding was already loaded),
# 0 if unsuccessful. Nothing is stored for the encoding when the file
# cannot be opened or a read error occurs, so a later call will try again
# rather than use a partial table.
sub loadmapencoding {
    my ($encoding, $mapfile) = @_;

    # check to see if the encoding has already been loaded
    return 1 if (defined $unicode::translations{$encoding});

    if (! -r $mapfile || -d $mapfile) {
        return 0;
    }
    # three argument open so the file name can never be taken as a mode
    open (my $map_fh, '<', $mapfile) or return 0;
    binmode ($map_fh);

    my $table = {'map' => [@unicode::array256], 'count' => 0};
    my $read_failed = 0;

    my ($in,$i,$j);
    while (1) {
        my $ret=read($map_fh, $in, 1);
        if (!defined($ret)) { # error
            print STDERR "unicode.pm: error reading mapfile: $!\n";
            $read_failed = 1;
            last;
        }
        if ($ret != 1) { last } # end of file
        $i = unpack ("C", $in);
        $table->{'map'}->[$i] = [@unicode::array256];
        for ($j=0; $j<256 && read($map_fh, $in, 2)==2; $j++) {
            my ($n1, $n2) = unpack ("CC", $in);
            $table->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
        }
        $table->{'count'} ++;
    }

    close ($map_fh);

    # a table cut short by a read error is not worth keeping
    return 0 if $read_failed;

    $unicode::translations{$encoding} = $table;
    return 1;
}
445
# unicode2singlebyte converts unicode to simple 8 bit encodings where
# characters below 0x80 are normal ascii characters and the rest are encoded
# using the appropriate mapping files.
#
# Takes a reference to a unicode array and the name of the target
# encoding; returns the encoded string. The unicode array is only read,
# never modified. If the encoding is not listed in $encodings::encodings,
# or its "from_uc" mapping file cannot be loaded, an error is printed to
# STDERR and the empty string is returned. Characters that cannot be
# translated are output as "?".
#
# Examples of encodings that may be converted using unicode2singlebyte are
# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
sub unicode2singlebyte {
    my ($uniref, $encoding) = @_;

    my $outtext = "";
    my $encodename = "unicode-$encoding";

    if (!exists $encodings::encodings->{$encoding}) {
        print STDERR "unicode.pm: ERROR - unsupported encoding "
            . "'$encoding' requested\n";
        return "";
    }

    my $enc_info = $encodings::encodings->{$encoding};
    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
                                      "from_uc", $enc_info->{'mapfile'});
    if (!&loadmapencoding ($encodename, $mapfile)) {
        print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
        return "";
    }

    foreach my $c (@$uniref) {
        if ($c < 0x80) {
            # normal ascii character
            $outtext .= chr($c);
        } else {
            # extended ascii character. The foreach variable is an alias
            # for the caller's array element, so the translation goes
            # into its own variable instead of being assigned back to $c.
            my $mapped = &transchar ($encodename, $c);

            # put a question mark if cannot translate
            if ($mapped == 0) {
                $outtext .= "?";
            } else {
                $outtext .= chr($mapped);
            }
        }
    }
    return $outtext;
}
490
491
# ensure_utf8 makes sure that the referenced input string is utf8 encoded.
# Each high byte that does not belong to a well-formed multi-byte sequence
# is assumed to be iso-8859-1 and is replaced, in place, by its utf-8
# encoding.
#
# returns 0 if the text was already utf8, or 1 if text modified to become
# utf8. When the argument is undefined or is not a reference to a scalar,
# nothing is changed and the argument itself is returned.
sub ensure_utf8 {
    my $stringref=shift;

    return $stringref
        unless defined($stringref) && ref($stringref) eq 'SCALAR';

    my $text=$$stringref;

    my $modified = 0;
    $text =~ m/^/g; # to set \G
    # visit every run of consecutive high bytes in turn
    while ($text =~ m!\G.*?([\x80-\xff]+)!sg) {
        my $run=$1;
        my $run_length=length($run);
        # make sure this run of high bytes is utf-8
        $run =~ /^/g; # set pos()
        my $run_changed = 0;
        while ($run =~
               m!\G (?: [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
                        [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
                        [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
                        [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
                        [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
                    )*([\x80-\xff])? !xg
              ) {
            # the captured byte, if any, is "out-of-place" for valid utf-8
            my $stray=$1;
            next unless defined $stray; # hit end of string
            my $after=pos($run);
            # replace bad byte. assume iso-8859-1 -> utf-8
            # ascii2utf8 does "extended ascii"... ie iso-8859-1
            my $fixed=&unicode::ascii2utf8(\$stray);
            substr($run, $after-1, 1, $fixed);
            # update the position to continue searching (for \G)
            pos($run) = $after+length($fixed)-1;
            $run_changed = 1;
        }
        next unless $run_changed;

        # put the repaired run back into the text
        $modified = 1;
        my $new_length=length($run); # the run may have grown
        my $run_end=pos($text); # pos at end of last match
        substr($text, $run_end-$run_length,
               $run_length, $run);
        # update the position to continue searching (for \G)
        pos($text)=$run_end+($new_length-$run_length)+1;
    }

    $$stringref = $text;
    return $modified;
}
548
# Returns true (1) if the given string is utf8 and false (0) if it isn't.
# An undefined value counts as not utf8. The test is the same one that
# ensure_utf8 applies: every run of high bytes has to consist entirely of
# well-formed multi-byte sequences.
# Does not modify the string parameter.
sub check_is_utf8 {
    my $value=shift;

    return 0 unless defined($value); # not utf8 because it is undefined

    $value =~ m/^/g; # to set \G
    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
        my $run=$1;
        # make sure this run of high bytes is utf-8
        $run =~ /^/g; # set pos()
        while ($run =~
               m!\G (?: [\xc0-\xdf][\x80-\xbf]    | # 2 byte utf-8
                        [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
                        [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
                        [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
                        [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
                    )*([\x80-\xff])? !xg
              ) {
            # a captured byte is a high byte outside any valid sequence;
            # no capture means the end of the run was reached
            return 0 if defined $1; # non-utf8 found
        }
    }

    return 1;
}
580
# url_encode returns the URL-encoded form of $text: every character other
# than ascii letters, digits, space, '.', '-' and '_' is replaced by
# %XX (two upper case hex digits). Text that is_url_encoded already
# recognises as encoded is returned unchanged.
sub url_encode {
    my ($text) = @_;

    return $text if &is_url_encoded($text);

    $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
    # return the url-encoded character entity for underscore back to the entity
    $text =~ s/%26%23095%3B/&\#095;/g;

    return $text;
}
591
# url_decode returns $text with every %XX escape (two hex digits, either
# case) replaced by the single byte it stands for. Anything else,
# including a '%' not followed by two hex digits, is left as it is.
sub url_decode {
    my ($encoded) = @_;

    $encoded =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;

    return $encoded;
}
598
# is_url_encoded reports whether $text contains at least one %XX escape
# made of upper case hex digits (or decimal digits). The result of the
# pattern match is returned as it is, so it is true or false when used in
# a condition.
sub is_url_encoded {
    my $candidate = shift;

    return $candidate =~ m/\%([A-F0-9]{2})/;
}
603
# When a filename on the filesystem is already URL-encoded, the
# URL to it will have %25s in place of every % sign, so that
# URLs in html pages can refer to the URL-encoded filename.
# This method changes the URL reference back into the actual
# (URL-encoded) filename on the filesystem by replacing %25 with %
# (only when the text contains URL escapes at all).
sub url_to_filename {
    my ($text) = @_;

    if (&is_url_encoded($text)) {
        $text =~ s/%25/%/g;
    }

    # DM safing would have replaced underscores with the character entity
    # &#095; in SourceFile meta. Undo any such change to get the filename
    # referred to.
    $text =~ s/&\#095;/_/g;

    return $text;
}
617
# When a filename on the filesystem is already URL-encoded, the
# URL to it will have %25s in place of every % sign, so that
# URLs in html pages can refer to the URL-encoded filename.
# Given a (URL-encoded) filename on the filesystem, this subroutine
# returns the URL reference string for it by replacing % with %25.
# The output string will be the same as the input string if the input
# already contains one or more %25s. This is to prevent processing
# a url more than once this way.
sub filename_to_url {
    my ($text) = @_;

    # already contains %25: treat it as processed and leave it alone
    return $text if $text =~ m/%25/;

    $text =~ s/%/%25/g;
    return $text;
}
634
# base64_encode returns a modified base64 encoding of $text in which the
# + and / signs that base64 may introduce are replaced with - and _, to
# ensure the result is filename-safe. Text that already conforms to this
# modified base64 alphabet (see conforms_to_mod_base64) is returned
# unchanged.
# NOTE(review): encode_base64 is called without an explicit line ending
# argument, so the result presumably carries the newline(s) that
# MIME::Base64 adds by default - confirm this is what callers expect.
sub base64_encode {
    my ($text) = @_;

    return $text if &conforms_to_mod_base64($text);

    # return entity for underscore to underscore before encoding
    $text =~ s/&\#095;/_/g;

    $text = &MIME::Base64::encode_base64($text);

    # + -> - and / -> _
    $text =~ tr{+/}{\-_};

    return $text;
}
649
# If the input fits the modified base64 pattern, this will try decoding it.
# Still, this method does not guarantee the return value is the 'original', only
# that the result is where the base64 decoding process has been applied once.
# Input that does not fit the pattern is returned unchanged.
# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
sub base64_decode {
    my ($text) = @_;

    return $text unless &conforms_to_mod_base64($text);

    # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
    # have replaced them with - and _ respectively. Undo this before decoding:
    # - -> + and _ -> /
    $text =~ tr{\-_}{+/};

    return &MIME::Base64::decode_base64($text);
}
665
# Returns true if the given string is compatible with a modified version
# of base64 (where the + and / are replaced with - and _), a format which
# includes also regular ASCII alphanumeric values. This method does not
# guarantee that the given string is actually base64 encoded, since it will
# return true for any simple alphanumeric ASCII string as well.
# The empty string does not conform.
sub conforms_to_mod_base64 {
    my ($candidate) = @_;

    # need to treat the entity ref for underscore as underscore
    $candidate =~ s/&\#095;/_/g;

    # base 64 takes alphanumeric and [=+/],
    # but we use modified base64 where + and / are replaced with - and _
    return $candidate =~ m/^[A-Za-z0-9\=\-\_]+$/; #alphanumeric and [=-_]
}
681
# unicode::substr is a character based substr for utf-8 strings: it
# returns, utf-8 encoded, the $length characters of $utf8_string that
# start at character position $offset (counting from 0). The result is
# cut short at the end of the string, and is the empty string when $offset
# lies at or beyond the end.
sub substr
{
    my ($utf8_string, $offset, $length) = @_;

    my $chars      = &utf82unicode($utf8_string);
    my $char_count = scalar(@$chars);

    return "" if $offset >= $char_count;

    # index of the last wanted character, clamped to the end of the string
    my $last = $offset + $length - 1;
    $last = $char_count - 1 if $last >= $char_count;

    my @wanted = @{$chars}[$offset..$last];
    return &unicode2utf8(\@wanted);
}
702
703
704	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: