source: main/trunk/greenstone2/perllib/unicode.pm@ 21411

Last change on this file since 21411 was 18410, checked in by ak19, 15 years ago

Some changes made in url and base64 encoding to deal with when the character entity for underscore had replaced the underscore (after dmsafing a filename so it won't cause conflicts with GS macros).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 19.5 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
# Unicode strings are stored as arrays of scalars (one code point per
# element) because perl lacks wide characters: its characters are 8-bit
# (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use MIME::Base64; # for base64 encoding
39
40no strict 'refs';
41
# ascii2unicode converts an (extended) ascii string (ISO-8859-1) into a
# unicode array: a reference to an array that holds the ordinal value of
# each character of the input, in order.
sub ascii2unicode {
    my ($in) = @_;

    # one array element per input character
    my @codes = map { ord($_) } split(//, $in);

    return \@codes;
}
57
# ascii2utf8 takes a REFERENCE to an (extended) ascii string, where
# "extended ascii" really means iso_8859_1, and returns the UTF-8 encoded
# string. It is a faster equivalent of
# "&unicode2utf8(&ascii2unicode($str));"
# An undefined reference, or a reference to an undefined value, gives "".
sub ascii2utf8 {
    my ($in) = @_;

    # nothing to convert
    return "" unless defined($in) && defined($$in);

    my @pieces;
    foreach my $char (split(//, $$in)) {
        my $code = ord($char);
        if ($code >= 0x80) {
            # extended ascii character: becomes a two-byte sequence
            push(@pieces,
                 chr(0xc0 + (($code >> 6) & 0x1f)) . chr(0x80 + ($code & 0x3f)));
        } else {
            # plain ascii character: copied through unchanged
            push(@pieces, chr($code));
        }
    }

    return join("", @pieces);
}
89
# unicode2utf8 takes a unicode array (reference to an array of code
# points) as input and returns it encoded as a utf-8 string.
#
# Undefined array elements are skipped. Code points U+0000..U+FFFF are
# encoded as one, two or three bytes; anything larger is not supported
# and makes the sub die with a message naming the offending value.
sub unicode2utf8 {
    my ($in) = @_;
    my $out = "";

    foreach my $num (@$in) {
        next unless defined $num;

        if ($num < 0x80) {
            # single byte (ascii)
            $out .= chr($num);

        } elsif ($num < 0x800) {
            # two bytes
            $out .= chr(0xc0 + (($num >> 6) & 0x1f));
            $out .= chr(0x80 + ($num & 0x3f));

        } elsif ($num <= 0xFFFF) {
            # three bytes; the upper bound is inclusive so that U+FFFF,
            # which fits in three bytes, is not rejected
            $out .= chr(0xe0 + (($num >> 12) & 0xf));
            $out .= chr(0x80 + (($num >> 6) & 0x3f));
            $out .= chr(0x80 + ($num & 0x3f));

        } else {
            # error, don't encode anything
            die sprintf("unicode::unicode2utf8: cannot encode U+%X "
                        . "(only U+0000..U+FFFF are supported)", $num);
        }
    }
    return $out;
}
117
# utf82unicode takes a utf-8 string and produces a unicode array
# (reference to an array of code points).
#
# Only one, two and three byte sequences are decoded. A stray
# continuation byte, a lead byte whose continuation bytes are missing or
# invalid, and any longer (full UCS) sequence produce no output; decoding
# simply resumes at the next character.
sub utf82unicode {
    my ($in) = @_;

    my @bytes = map { ord($_) } split(//, $in);
    my $total = scalar(@bytes);
    my @codes;

    my $pos = 0;
    while ($pos < $total) {
        my $lead = $bytes[$pos];
        my $used = 1;    # how many characters this pass consumes

        if ($lead < 0x80) {
            # normal ascii character
            push(@codes, $lead);

        } elsif ($lead >= 0xc0 && $lead < 0xe0 && $pos + 1 < $total) {
            # lead byte of a two byte sequence
            my $t1 = $bytes[$pos + 1];
            if ($t1 >= 0x80 && $t1 < 0xc0) {
                push(@codes, (($lead & 0x1f) << 6) + ($t1 & 0x3f));
                $used = 2;
            }

        } elsif ($lead >= 0xe0 && $lead < 0xf0 && $pos + 2 < $total) {
            # lead byte of a three byte sequence
            my ($t1, $t2) = @bytes[$pos + 1, $pos + 2];
            if ($t1 >= 0x80 && $t1 < 0xc0 && $t2 >= 0x80 && $t2 < 0xc0) {
                push(@codes, (($lead & 0xf) << 12)
                             + (($t1 & 0x3f) << 6)
                             + ($t2 & 0x3f));
                $used = 3;
            }
        }
        # everything else is an error (unexpected continuation byte,
        # truncated sequence, or beyond three bytes): emit nothing

        $pos += $used;
    }

    return \@codes;
}
170
# unicode2ucs2 takes a unicode array and produces a UCS-2 unicode string:
# every code point becomes two characters, high byte first.
sub unicode2ucs2 {
    my ($in) = @_;

    return join("",
                map { chr(($_ & 0xff00) >> 8) . chr($_ & 0xff) } @$in);
}
184
# ucs22unicode takes a UCS-2 string (two bytes per character, high byte
# first, i.e. the format produced by unicode2ucs2) and produces a unicode
# array. A trailing odd byte, if any, is ignored.
sub ucs22unicode {
    my ($in) = @_;
    my $out = [];

    my $len = length($in);

    # step two bytes at a time: each pair forms exactly one character
    for (my $i = 0; $i + 1 < $len; $i += 2) {
        my $high = ord(substr($in, $i, 1));
        my $low  = ord(substr($in, $i + 1, 1));

        # the shift needs its own parentheses: "+" binds more tightly
        # than "<<", so "$high << 8 + $low" would shift by (8 + $low)
        push(@$out, ($high << 8) + $low);
    }

    return $out;
}
201
# convert2unicode takes an encoding name and a reference to a string in
# that encoding, and returns a reference to a unicode array.
#
# The encoding must be listed in $encodings::encodings. Its mapping file
# is looked up under $GSDLHOME/mappings/to_uc and loaded on first use.
# If the encoding entry names its own 'converter' that is called;
# otherwise a map with exactly one block is handled as a single-byte
# encoding and anything else as a double-byte encoding.
# On failure a message goes to STDERR and an empty array is returned.
sub convert2unicode {
    my ($encoding, $textref) = @_;

    my $enc_info = $encodings::encodings->{$encoding};
    unless (defined $enc_info) {
        print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
        return [];
    }

    my $encodename = "$encoding-unicode";
    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
                                      "to_uc", $enc_info->{'mapfile'});
    unless (&loadmapencoding($encodename, $mapfile)) {
        print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
        return [];
    }

    # an encoding may supply its own conversion routine
    my $converter = $enc_info->{'converter'};
    return &$converter($encodename, $textref) if defined $converter;

    return ($unicode::translations{$encodename}->{'count'} == 1)
        ? &singlebyte2unicode($encodename, $textref)
        : &doublebyte2unicode($encodename, $textref);
}
231
# singlebyte2unicode converts simple 8 bit encodings where characters below
# 0x80 are normal ascii characters and the rest are decoded using the
# mapping already loaded under $encodename.
#
# Examples of encodings that may be converted using singlebyte2unicode are
# the iso-8859 and windows-125* series.
#
# Takes the map name and a reference to the text; returns a reference to
# a unicode array. Characters with no mapping become 0x25A1.
sub singlebyte2unicode {
    my ($encodename, $textref) = @_;

    my @outtext;
    foreach my $char (split(//, $$textref)) {
        my $code = ord($char);
        if ($code >= 0x80) {
            # high character: look it up in the map
            $code = &transchar($encodename, $code);
            # put a square (0x25A1) if cannot translate
            $code = 0x25A1 if $code == 0;
        }
        push(@outtext, $code);
    }

    return \@outtext;
}
260
# doublebyte2unicode converts simple two byte encodings where characters
# below code point 0x80 are single-byte characters and every other
# character is the first half of a double-byte character.
#
# Examples of encodings that may be converted using doublebyte2unicode are
# CJK encodings like GB encoded Chinese and UHC Korean.
#
# No error checking is performed to make sure that the input text is
# valid for the given encoding; the only problem reported (on STDERR) is
# a lead byte at the very end of the text with nothing following it.
#
# Encodings that may contain characters of more than two bytes are not
# supported (any EUC encoded text may in theory contain 3-byte characters
# but in practice only one and two byte characters are used).
sub doublebyte2unicode {
    my ($encodename, $textref) = @_;

    my @pending = map { ord($_) } split(//, $$textref);
    my @outtext;

    while (@pending) {
        my $lead = shift(@pending);

        if ($lead < 0x80) {
            # single-byte character
            push(@outtext, $lead);
            next;
        }

        unless (@pending) {
            # error
            print STDERR "unicode: ERROR missing second half of double-byte character\n";
            next;
        }

        # double-byte character
        my $trail = shift(@pending);
        my $code = &transchar($encodename, ($lead << 8) | $trail);
        # put a square (0x25A1) if cannot translate
        $code = 0x25A1 if $code == 0;
        push(@outtext, $code);
    }

    return \@outtext;
}
307
# Shift-JIS to unicode
# We can't use doublebyte2unicode for Shift-JIS because it uses some
# single-byte characters above code point 0x80 (i.e. half-width katakana
# characters in the range 0xA1-0xDF)
#
# Takes the map name and a reference to the text; returns a reference to
# a unicode array. Invalid input is reported on STDERR and skipped.
sub shiftjis2unicode {
    my ($encodename, $textref) = @_;

    my @bytes = map { ord($_) } split(//, $$textref);
    my $total = scalar(@bytes);
    my @outtext;

    my $pos = 0;
    while ($pos < $total) {
        my $lead = $bytes[$pos++];

        if ($lead == 0x5c || $lead == 0x7E || ($lead >= 0xA1 && $lead <= 0xDF)) {
            # Single-byte half-width katakana character or
            # JIS Roman yen or overline characters
            my $code = &transchar($encodename, $lead);
            # - put a square (0x25A1) if cannot translate
            $code = 0x25A1 if $code == 0;
            push(@outtext, $code);
            next;
        }

        if ($lead < 0x80) {
            # ASCII
            push(@outtext, $lead);
            next;
        }

        if ($lead >= 0xEF) {
            # error
            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
            next;
        }

        if ($pos >= $total) {
            # error: lead byte is the last character of the text
            print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
            next;
        }

        # the second byte is consumed whether or not it is acceptable
        my $trail = $bytes[$pos++];
        unless (($trail >= 0x40 && $trail <= 0x7E) || ($trail >= 0x80 && $trail <= 0xFC)) {
            # error
            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
            next;
        }

        # Double-byte shift-jis character
        my $code = &transchar($encodename, ($lead << 8) | $trail);
        # put a square (0x25A1) if cannot translate
        $code = 0x25A1 if $code == 0;
        push(@outtext, $code);
    }

    return \@outtext;
}
364
# transchar looks up one character code in the translation map that was
# loaded under the name $encoding. The code is split into a high byte,
# which selects a block of the map, and a low byte, which selects the
# entry within that block.
# Returns the mapped value, or 0 when the map is not loaded or has no
# block for the high byte.
sub transchar {
    my ($encoding, $from) = @_;

    my $table = $unicode::translations{$encoding};
    return 0 unless defined $table;

    my $high = ($from / 256) % 256;
    my $low = $from % 256;

    my $blocks = $table->{'map'};
    my $row = $blocks->[$high];

    # blocks that were never loaded are not array references
    return 0 if ref($row) ne "ARRAY";

    return $row->[$low];
}
379
# %translations is of the form:
#
# encodings{encodingname-encodingname}->{'map'}->blocktranslation
# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
#
# Any of the top translation blocks can point to an undefined
# value. This data structure aims to allow fast translation and
# efficient storage.
%unicode::translations = ();

# @array256 is a list of 256 zeros; it is copied whenever a fresh,
# empty translation block is needed.
@unicode::array256 = (0) x 256;
409
# loadmapencoding reads the binary mapping file $mapfile and stores the
# result in $unicode::translations{$encoding} as
#   { 'map' => [...256 blocks...], 'count' => number of blocks read }
#
# As read here, the file is a sequence of blocks: one byte giving the
# block number (the high byte of the source code), followed by up to 256
# two-byte big-endian values, one per low byte.
#
# returns 1 if successful (or if the encoding was already loaded),
# 0 if unsuccessful. On a read error nothing is stored, so a later call
# can try again rather than finding a half-loaded map.
sub loadmapencoding {
    my ($encoding, $mapfile) = @_;

    # check to see if the encoding has already been loaded
    return 1 if (defined $unicode::translations{$encoding});

    if (! -r $mapfile || -d $mapfile) {
        return 0;
    }

    # three-argument open: the file name can never be taken as a mode
    my $fh;
    return 0 unless open($fh, '<', $mapfile);
    binmode($fh);

    my $table = {'map' => [(0) x 256], 'count' => 0};
    my $ok = 1;

    my $in;
    while (1) {
        my $ret = read($fh, $in, 1);
        if (!defined($ret)) { # error
            print STDERR "unicode.pm: error reading mapfile: $!\n";
            $ok = 0;
            last;
        }
        last if $ret != 1; # end of file

        my $i = unpack("C", $in);
        my @row = (0) x 256;
        for (my $j = 0; $j < 256; $j++) {
            my $got = read($fh, $in, 2);
            # a short final block keeps zeros in the remaining slots
            last unless defined($got) && $got == 2;
            my ($n1, $n2) = unpack("CC", $in);
            $row[$j] = ($n1 * 256) + $n2;
        }
        $table->{'map'}->[$i] = \@row;
        $table->{'count'}++;
    }

    close($fh);

    return 0 unless $ok;

    # only a completely read map is registered
    $unicode::translations{$encoding} = $table;
    return 1;
}
445
# unicode2singlebyte converts unicode to simple 8 bit encodings where
# characters below 0x80 are normal ascii characters and the rest are encoded
# using the appropriate mapping files.
#
# Examples of encodings that may be converted using unicode2singlebyte are
# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
#
# Takes a reference to a unicode array and an encoding name listed in
# $encodings::encodings; returns the encoded string. The mapping file is
# looked up under $GSDLHOME/mappings/from_uc. Characters that cannot be
# translated become "?". On failure a message goes to STDERR and "" is
# returned. The input array is left unmodified.
sub unicode2singlebyte {
    my ($uniref, $encoding) = @_;

    my $outtext = "";
    my $encodename = "unicode-$encoding";

    if (!exists $encodings::encodings->{$encoding}) {
        print STDERR "unicode.pm: ERROR - unsupported encoding "
            . "'$encoding' requested\n";
        return "";
    }

    my $enc_info = $encodings::encodings->{$encoding};
    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
                                      "from_uc", $enc_info->{'mapfile'});
    if (!&loadmapencoding($encodename, $mapfile)) {
        print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
        return "";
    }

    foreach my $code (@$uniref) {
        if ($code < 0x80) {
            # normal ascii character
            $outtext .= chr($code);
            next;
        }

        # extended character. The result goes into its own variable:
        # the loop variable is an alias for the array element, so
        # assigning to it would overwrite the caller's data.
        my $mapped = &transchar($encodename, $code);

        # put a question mark if cannot translate
        $outtext .= ($mapped == 0) ? "?" : chr($mapped);
    }
    return $outtext;
}
490
491
# this makes sure that the referenced input string is utf8 encoded, and
# will change bytes that aren't: every high byte that does not belong to
# a well-formed multi-byte sequence is assumed to be iso-8859-1 and is
# replaced, in place, by its utf-8 encoding.
# returns 0 if the text was already utf8, or 1 if text modified to become utf8
# (if the argument is undefined or not a reference to a scalar, it is
# returned unchanged instead)
sub ensure_utf8 {
    my $stringref = shift;

    return $stringref
        unless defined($stringref) && ref($stringref) eq 'SCALAR';

    my $text = $$stringref;
    my $modified = 0;

    $text =~ m/^/g; # to set \G
    while ($text =~ m!\G.*?([\x80-\xff]+)!sg) {
        # $run is one maximal stretch of consecutive high bytes
        my $run = $1;
        my $original_run_length = length($run);
        my $run_changed = 0;

        # make sure this block of high bytes is utf-8
        $run =~ /^/g; # set pos()
        while ($run =~
               m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
                   [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
                   [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
                   [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
                   [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
                   )*([\x80-\xff])? !xg
            ) {
            # a captured byte is "out-of-place" for valid utf-8;
            # no capture means we hit the end of the run
            my $stray = $1;
            next unless defined $stray;

            # remember where we are: editing the string resets pos()
            my $at = pos($run);
            # replace bad byte. assume iso-8859-1 -> utf-8
            # ascii2utf8 does "extended ascii"... ie iso-8859-1
            my $fixed = &unicode::ascii2utf8(\$stray);
            substr($run, $at-1, 1, $fixed);
            # continue searching (for \G) just after the replacement
            pos($run) = $at+length($fixed)-1;
            $run_changed = 1;
        }

        next unless $run_changed;

        # splice the repaired run back into the text
        $modified = 1;
        my $new_run_length = length($run); # we've changed the length
        my $match_end = pos($text); # pos at end of last match
        substr($text, $match_end-$original_run_length,
               $original_run_length, $run);
        # update the position to continue searching (for \G)
        pos($text) = $match_end+($new_run_length-$original_run_length)+1;
    }

    $$stringref = $text;
    return $modified;
}
548
# Returns true (1) if the given string is utf8 and false (0) if it isn't
# (an undefined value counts as not utf8).
# Does not modify the string parameter.
sub check_is_utf8 {
    my $value = shift;

    # not utf8 because it is undefined
    return 0 unless defined($value);

    $value =~ m/^/g; # to set \G
    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
        # $run is one maximal stretch of consecutive high bytes
        my $run = $1;

        # make sure this block of high bytes is utf-8
        $run =~ /^/g; # set pos()
        while ($run =~
               m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
                   [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
                   [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
                   [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
                   [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
                   )*([\x80-\xff])? !xg
            ) {
            # a captured byte did not fit any well-formed sequence;
            # no capture just means the end of the run was reached
            return 0 if defined $1;
        }
    }

    return 1;
}
580
# url_encode returns $text with every character other than ASCII
# letters, digits, space, '.', '-' and '_' replaced by its %XX escape.
# Text that is_url_encoded already recognises as encoded is returned
# untouched, so that it is not encoded twice.
sub url_encode {
    my ($text) = @_;

    return $text if &is_url_encoded($text);

    $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
    # return the url-encoded character entity for underscore back to the entity
    $text =~ s/%26%23095%3B/&\#095;/g;

    return $text;
}
591
# url_decode returns $text with every %XX escape (two hex digits, either
# case) replaced by the single character with that code. Anything that
# is not a complete escape is left as it is.
sub url_decode {
    my ($text) = @_;

    $text =~ s/\%([A-F0-9]{2})/chr(hex($1))/ige;

    return $text;
}
598
# is_url_encoded returns true if $text contains at least one %XX escape
# (a percent sign followed by two hex digits), false otherwise.
# Hex digits are accepted in either case, matching what url_decode
# decodes; otherwise text containing lower-case escapes such as "%2f"
# would be taken as unencoded and get encoded a second time.
sub is_url_encoded {
    my ($text) = @_;
    return ($text =~ m/\%[A-F0-9]{2}/i);
}
603
# When a filename on the filesystem is already URL-encoded, the
# URL to it will have %25s in place of every % sign, so that
# URLs in html pages can refer to the URL-encoded filename.
# This method turns such a URL reference back into the actual
# (URL-encoded) filename on the filesystem: every %25 becomes %, and
# every &#095; entity becomes an underscore again.
sub url_to_filename {
    my ($text) = @_;

    if (&is_url_encoded($text)) {
        $text =~ s/%25/%/g;
    }

    # DM safing would have replaced underscores with character entity &#095;
    # in SourceFile meta. Undo any such change to get the filename referred to.
    $text =~ s/&\#095;/_/g;

    return $text;
}
617
# When a filename on the filesystem is already URL-encoded, the
# URL to it will have %25s in place of every % sign, so that
# URLs in html pages can refer to the URL-encoded filename.
# Given a (URL-encoded) filename on the filesystem, this subroutine
# returns the URL reference string for it by replacing % with %25.
# An input that already contains one or more %25s is returned as it is,
# so that a url is not processed this way more than once.
sub filename_to_url {
    my ($text) = @_;

    # already processed
    return $text if $text =~ m/%25/;

    $text =~ s/%/%25/g;
    return $text;
}
634
# base64_encode returns $text in a modified, filename-safe base64 form in
# which + and / are replaced with - and _ respectively.
#
# Text that already fits the modified base64 alphabet (see
# conforms_to_mod_base64) is returned unchanged, so encoding is not
# applied twice. Any &#095; entity is turned back into an underscore
# before encoding.
sub base64_encode {
    my ($text) = @_;
    if (!&conforms_to_mod_base64($text)) {
        # return entity for underscore to underscore before encoding
        $text =~ s/&\#095;/_/g;

        # An empty end-of-line string is passed on purpose. By default
        # encode_base64 appends "\n" and breaks the output into lines of
        # at most 76 characters; newlines are not filename-safe, and a
        # value with embedded newlines would no longer be recognised by
        # conforms_to_mod_base64.
        $text = &MIME::Base64::encode_base64($text, "");

        # base64 encoding may introduce + and / signs,
        # replacing them with - and _ to ensure it's filename-safe
        $text =~ s/\+/\-/g; # + -> -
        $text =~ s/\//\_/g; # / -> _
    }
    return $text;
}
649
# If the input fits the modified base64 pattern, this will try decoding it;
# any other input is returned unchanged.
# Still, this method does not guarantee the return value is the 'original', only
# that the result is where the base64 decoding process has been applied once.
# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
sub base64_decode {
    my ($text) = @_;

    return $text unless &conforms_to_mod_base64($text);

    # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
    # have replaced them with - and _ respectively. Undo this before decoding.
    $text =~ tr{\-_}{+/}; # - -> + and _ -> /

    return &MIME::Base64::decode_base64($text);
}
665
# Returns true if the given string is compatible with a modified version
# of base64 (where the + and / are replaced with - and _), a format which
# includes also regular ASCII alphanumeric values. This method does not
# guarantee that the given string is actually base64 encoded, since it will
# return true for any simple alphanumeric ASCII string as well.
# The entity &#095; is counted as an underscore. The caller's string is
# not modified.
sub conforms_to_mod_base64 {
    my ($candidate) = @_;

    # need to treat the entity ref for underscore as underscore
    $candidate =~ s/&\#095;/_/g;

    # base 64 takes alphanumeric and [=+/],
    # but we use modified base64 where + and / are replaced with - and _
    return ($candidate =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
}
681
# unicode::substr extracts a substring from a utf-8 encoded string,
# counting in characters rather than bytes.
#
#   $utf8_string  the utf-8 encoded input
#   $offset       index of the first character wanted (0-based); a
#                 negative value counts back from the end of the string
#   $length       number of characters wanted; if omitted, everything up
#                 to the end of the string is returned
#
# Returns the selected characters as a utf-8 string, or "" when the
# requested range lies outside the string or is empty.
sub substr
{
    my ($utf8_string, $offset, $length) = @_;

    my @unicode_string = @{&utf82unicode($utf8_string)};
    my $unicode_string_length = scalar(@unicode_string);

    my $substr_start = defined($offset) ? $offset : 0;
    # a negative offset is relative to the end of the string
    $substr_start += $unicode_string_length if $substr_start < 0;
    if ($substr_start >= $unicode_string_length) {
        return "";
    }

    # no length given: run to the end of the string
    my $substr_end = defined($length)
        ? $substr_start + $length - 1
        : $unicode_string_length - 1;
    if ($substr_end >= $unicode_string_length) {
        $substr_end = $unicode_string_length - 1;
    }

    # a start before the beginning is clamped only after the end has been
    # worked out, so just the part inside the string is kept and the
    # slice below never sees a negative index
    $substr_start = 0 if $substr_start < 0;
    return "" if $substr_end < $substr_start;

    my @unicode_substring = @unicode_string[$substr_start..$substr_end];
    return &unicode2utf8(\@unicode_substring);
}
702
703
7041;
Note: See TracBrowser for help on using the repository browser.